# **Libraries**

In [2]:
import pandas as pd
import numpy as np

from feature_engine.selection import DropCorrelatedFeatures
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder

import functions
import importlib
importlib.reload(functions)

import time

import warnings

# **Display**

In [3]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [4]:
app_train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)

# previous_apps = pd.read_csv(
#     r"C:\Users\Dell\Documents\AI\Risk\Data\previous_application.csv",
#     index_col=False
# )

# app_test = pd.read_csv(
#     r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
#     index_col=False
# )

In [5]:
app_train.shape

(307511, 122)

# **Variables**

In [6]:
random_state = 101
target = 'TARGET'

## **Remove Empty Features**

In [5]:
# list_columns = functions.check_columns_with_one_uniquevalue(app_test)

In [6]:
# app_train = app_train.drop(list_columns, axis = 1)

In [7]:
from feature_engine.selection import DropConstantFeatures

dcf = DropConstantFeatures(tol = 0.75)
dcf.fit_transform(app_train)

ValueError: Some of the variables in the dataset contain NaN. Check and remove those before using this transformer.

In [29]:
app_train.shape

(307511, 69)

## **Drop Collinear Features**

In [7]:
dcf = DropCorrelatedFeatures(threshold=0.7)
previous_apps = dcf.fit_transform(previous_apps)
app_train = dcf.fit_transform(app_train)

## **Reduce Memory Usage**

In [8]:
previous_apps = functions.reduce_memory_usage(previous_apps)

app_train = functions.reduce_memory_usage(app_train)

Memory usage of dataframe is 407.77 MB
Memory usage after optimization is: 277.15 MB
Decreased by 32.0%
Memory usage of dataframe is 161.88 MB
Memory usage after optimization is: 64.52 MB
Decreased by 60.1%


## **Missing Values**

In [None]:
functions.MissingValues(previous_apps)

## **Drop Features (More than 50% Missing)**

In [9]:
columns = ['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'RATE_DOWN_PAYMENT', 'AMT_DOWN_PAYMENT']
previous_apps = previous_apps.drop(columns, axis=1)

## **Impute Missing Values**

In [10]:
num_value = -99999

for col in previous_apps.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    previous_apps[col].fillna(num_value, inplace=True)

In [11]:
num_value = -99999

for col in app_train.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    app_train[col].fillna(num_value, inplace=True)

In [None]:
cat_value = 'UNKNOWN'

for col in previous_apps.select_dtypes(include=['object']).columns:
    previous_apps[col].fillna(cat_value, inplace=True)

In [12]:
cat_value = 'UNKNOWN'

for col in app_train.select_dtypes(include=['object']).columns:
    app_train[col].fillna(cat_value, inplace=True)

## **Aggregation**

In [13]:
aggregated_previous_app = previous_apps.groupby('SK_ID_CURR').agg({
    'SK_ID_PREV': 'count',  
    'AMT_ANNUITY': 'mean',    
    'DAYS_DECISION': ['mean', 'max', 'min'],  
    'CNT_PAYMENT': ['sum'],
    'DAYS_FIRST_DUE': lambda x: x.max() - x.min(),
    'DAYS_LAST_DUE': lambda x: x.max() - x.min()
}).reset_index()

# Flatten multi-level columns generated by the aggregations
aggregated_previous_app.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in aggregated_previous_app.columns]

aggregated_previous_app = aggregated_previous_app.rename(columns={
    'SK_ID_CURR_': 'SK_ID_CURR', 
    'SK_ID_PREV_count': 'NUM_PREVIOUS_APPLICATIONS',
    'AMT_ANNUITY_mean': 'AVG_ANNUITY_AMOUNT',
    'DAYS_DECISION_mean': 'AVG_DAYS_DECISION',
    'DAYS_DECISION_max': 'MAX_DAYS_DECISION',
    'DAYS_DECISION_min': 'MIN_DAYS_DECISION',
    'CNT_PAYMENT_sum': 'SUM_CNT_PAYMENT',
    'DAYS_FIRST_DUE_<lambda>': 'RANGE_DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_<lambda>': 'RANGE_DAYS_LAST_DUE',
})

aggregated_previous_app.head()

Unnamed: 0,SK_ID_CURR,NUM_PREVIOUS_APPLICATIONS,AVG_ANNUITY_AMOUNT,AVG_DAYS_DECISION,MAX_DAYS_DECISION,MIN_DAYS_DECISION,SUM_CNT_PAYMENT,RANGE_DAYS_FIRST_DUE,RANGE_DAYS_LAST_DUE
0,100001,1,3951.0,-1740.0,-1740,-1740,8.0,0.0,0.0
1,100002,1,9251.775391,-606.0,-606,-606,24.0,0.0,0.0
2,100003,3,56553.988281,-1305.0,-746,-2341,30.0,1594.0,1444.0
3,100004,1,5357.25,-815.0,-815,-815,4.0,0.0,0.0
4,100005,2,-47592.898438,-536.0,-315,-757,-inf,99293.0,99533.0


## **Merge Application Train and Previous Application Data**

In [14]:
data = app_train.merge(aggregated_previous_app, on='SK_ID_CURR', how='left')

## **Drop Columns**

In [None]:
columns = ['FLAG_DOCUMENT_5', 
                    'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_15',
                    'FLAG_DOCUMENT_12',
                    'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_2',
                    'FLAG_DOCUMENT_10',
                    'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_19',
                   'LIVE_CITY_NOT_WORK_CITY', 
                   'AMT_REQ_CREDIT_BUREAU_DAY',
                   'FLAG_DOCUMENT_8',
                   'FLAG_MOBIL',
                   'FLAG_CONT_MOBILE',
                   'REG_REGION_NOT_LIVE_REGION',
                   'FLAG_DOCUMENT_9',
                   'FLAG_DOCUMENT_4',
                   'FLAG_DOCUMENT_7',
                   'FLAG_EMP_PHONE',
                   'REG_REGION_NOT_WORK_REGION',
                   'HOUSETYPE_MODE',
                   'FLOORSMIN_MODE',
                    'FLOORSMIN_MEDI',
                    'ENTRANCES_MEDI',
                    'FLOORSMAX_MODE'
                   ] 
data = data.drop(columns=columns)

## **WoE Encoder**

In [16]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
data = woe.transform(data)

## **Train Test Split**

In [25]:
X = data.drop('TARGET', axis=1)
y = data['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [35]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [None]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

## **Drop Columns**

In [None]:
columns = ['FLAG_DOCUMENT_5', 
                    'FLAG_DOCUMENT_17',
                    'FLAG_DOCUMENT_15',
                    'FLAG_DOCUMENT_12',
                    'FLAG_DOCUMENT_14',
                    'FLAG_DOCUMENT_2',
                    'FLAG_DOCUMENT_10',
                    'FLAG_DOCUMENT_20',
                    'FLAG_DOCUMENT_19',
                   'LIVE_CITY_NOT_WORK_CITY', 
                   'AMT_REQ_CREDIT_BUREAU_DAY',
                   'FLAG_DOCUMENT_8',
                   'FLAG_MOBIL',
                   'FLAG_CONT_MOBILE',
                   'REG_REGION_NOT_LIVE_REGION',
                   'FLAG_DOCUMENT_9',
                   'FLAG_DOCUMENT_4',
                   'FLAG_DOCUMENT_7',
                   'FLAG_EMP_PHONE',
                   'REG_REGION_NOT_WORK_REGION',
                   'HOUSETYPE_MODE',
                   'FLOORSMIN_MODE',
                    'FLOORSMIN_MEDI',
                    'ENTRANCES_MEDI',
                    'FLOORSMAX_MODE'
                   ] 
data = data.drop(columns=columns)

## **Recurssive Feature Elimination**

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from feature_engine.selection import RecursiveFeatureElimination
X = pd.DataFrame(dict(x1 = [1000,2000,1000,1000,2000,3000],
                    x2 = [2,4,3,1,2,2],
                    x3 = [1,1,1,0,0,0],
                    x4 = [1,2,1,1,0,1],
                    x5 = [1,1,1,1,1,1]))
y = pd.Series([1,0,0,1,1,0])
rfe = RecursiveFeatureElimination(RandomForestClassifier(random_state=2), cv=2)
rfe.fit_transform(X, y)

Unnamed: 0,x2
0,2
1,4
2,3
3,1
4,2
5,2


In [2]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.metrics import make_scorer, roc_auc_score

# Load a sample dataset (Breast cancer dataset)
data = load_breast_cancer()
X, y = data.data, data.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the data using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the model for feature selection (Logistic Regression)
model = LogisticRegression(max_iter=2000, solver='lbfgs', random_state=42)

# Define a custom scorer using AUC with response_method='predict_proba' (for ROC AUC scoring)
auc_scorer = make_scorer(roc_auc_score, response_method='predict_proba')

# Initialize Recursive Feature Elimination with AUC as scoring
rfe = RecursiveFeatureElimination(
    estimator=model,
    variables=None,  # If None, RFE will evaluate all numerical features
    scoring=auc_scorer,  # Use AUC for scoring
    threshold=0.01,  # Feature importance threshold to drop variables
    cv=3  # Number of cross-validation folds
)

# Fit the RFE
rfe.fit(X_train, y_train)

# Transform the datasets to retain only selected features
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Display results
print("Original number of features:", X_train.shape[1])
print("Selected number of features:", X_train_selected.shape[1])


Original number of features: 30
Selected number of features: 2


In [None]:
# Import necessary libraries
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from feature_engine.selection import RecursiveFeatureElimination
from sklearn.metrics import make_scorer, roc_auc_score

# # Load a sample dataset (Breast cancer dataset)
# data = load_breast_cancer()
# X, y = data.data, data.target

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# # Scale the data using StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Initialize the model for feature selection (Logistic Regression)
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

# Define a custom scorer using AUC with response_method='predict_proba' (for ROC AUC scoring)
auc_scorer = make_scorer(roc_auc_score, response_method='predict_proba')

# Initialize Recursive Feature Elimination with AUC as scoring
rfe = RecursiveFeatureElimination(
    estimator=model,
    variables=None,  # If None, RFE will evaluate all numerical features
    scoring=auc_scorer,  # Use AUC for scoring
    threshold=0.01,  # Feature importance threshold to drop variables
    cv=3,  # Number of cross-validation folds
    n_features_to_select=3
)

# Fit the RFE
rfe.fit(X_train, y_train)

# Transform the datasets to retain only selected features
X_train_selected = rfe.transform(X_train)
X_test_selected = rfe.transform(X_test)

# Display results
print("Original number of features:", X_train.shape[1])
print("Selected number of features:", X_train_selected.shape[1])


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

# # Sample dataset
# np.random.seed(42)
# X = pd.DataFrame({
#     'feature_1': np.random.rand(100),
#     'feature_2': np.random.rand(100),
#     'feature_3': np.random.rand(100),
#     'feature_4': np.random.rand(100),
#     'feature_5': np.random.rand(100),
# })

# # Create a target variable
# y = np.random.randint(2, size=100)

# # Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RFE with a RandomForest model
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=3)

# Fit RFE
rfe.fit(X_train, y_train)

# Transform the dataset
X_rfe = rfe.transform(X_train)

# Get selected features
selected_features = X.columns[rfe.support_]
print("Selected Features:", selected_features.tolist())
