# **Libraries**

In [36]:
import importlib
import os
import time
import warnings
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from feature_engine.encoding import WoEEncoder
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import functions

importlib.reload(functions)

# **Display**

In [17]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [76]:
# Folder containing the application CSV files. The location is defined once
# and can be overridden with the RISK_DATA_DIR environment variable, so the
# notebook is not tied to a single machine; the default is the original path.
DATA_DIR = Path(
    os.environ.get("RISK_DATA_DIR", r"C:\Users\Dell\Documents\AI\Risk\Data")
)

# index_col=False: do not use the first CSV column as the index.
app_train = pd.read_csv(DATA_DIR / "application_train.csv", index_col=False)
app_test = pd.read_csv(DATA_DIR / "application_test.csv", index_col=False)

# **Variables**

In [77]:
# Notebook-wide settings: label column name and the seed reused by the
# shuffle / train_test_split calls further down.
target = 'TARGET'
random_state = 101

# Work on a copy so the frame read from disk (app_train) is left untouched.
data = app_train.copy()

## **Remove Empty Features**

In [78]:
# Columns flagged by the project helper; judging by its name, these are columns
# holding a single unique value (confirm in functions.py).
# Note the check runs on app_test, while the result is dropped from the
# training frame `data` in the next cell.
list_columns = functions.check_columns_with_one_uniquevalue(app_test)

In [79]:
# Remove the columns collected in list_columns from the working frame.
data = data.drop(columns=list_columns)

## **Drop Collinear Features**

In [80]:
# Run the correlation filter on the predictors only. Previously the label
# column was part of the frame handed to DropCorrelatedFeatures, so it took
# part in the pairwise-correlation check (a predictor correlated with the
# label above the threshold could have been discarded), and the fitted
# transformer was tied to a frame that contains the label.
predictors = data.drop(columns=[target])

dcf = DropCorrelatedFeatures(threshold=0.7)

# Re-attach the label after filtering; it ends up as the last column.
data = dcf.fit_transform(predictors).assign(**{target: data[target]})

## **Drop Unneeded Features**

In [81]:
# Drop SK_ID_CURR (presumably a row identifier with no predictive meaning —
# confirm). Rebinding instead of inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: the in-place version raised KeyError on a
# second execution because the column was already gone.
data = data.drop(columns=['SK_ID_CURR'], errors='ignore')

## **Reduce Memory Usage**

In [82]:
# Pass the frame through the project's memory-reduction helper and keep the
# returned frame. The recorded output below reports 159.54 MB -> 63.35 MB.
# NOTE(review): presumably this downcasts numeric dtypes; confirm in
# functions.py which dtypes it can produce.
data = functions.reduce_memory_usage(data)

Memory usage of dataframe is 159.54 MB
Memory usage after optimization is: 63.35 MB
Decreased by 60.3%


## **Missing Values**

In [68]:
# Display the project helper's missing-value summary for the working frame
# (the recorded output lists NumberMissing and PercentageMissing per column).
functions.MissingValues(data)

Unnamed: 0,NumberMissing,PercentageMissing
COMMONAREA_AVG,214865,69.87
NONLIVINGAPARTMENTS_AVG,213514,69.43
FONDKAPREMONT_MODE,210295,68.39
YEARS_BUILD_AVG,204488,66.5
OWN_CAR_AGE,202929,65.99
LANDAREA_AVG,182590,59.38
BASEMENTAREA_AVG,179943,58.52
EXT_SOURCE_1,173378,56.38
NONLIVINGAREA_AVG,169682,55.18
WALLSMATERIAL_MODE,156341,50.84


In [83]:
# Sentinel written into missing floating-point cells.
num_value = -99999

# Assign the filled column back instead of calling fillna(inplace=True) on
# data[col]: that form is chained assignment, which pandas deprecates and which
# leaves `data` unchanged when Copy-on-Write is active.
# NOTE(review): float16 is among the selected dtypes, and -99999 is outside
# float16's finite range (max 65504); check float16 columns for -inf or an
# unintended upcast after filling.
float_cols = data.select_dtypes(include=['float16', 'float32', 'float64']).columns
for col in float_cols:
    data[col] = data[col].fillna(num_value)

In [84]:
# Placeholder category written into missing cells of object-typed columns.
cat_value = 'UNKNOWN'

# Assign the filled column back instead of calling fillna(inplace=True) on
# data[col]: that form is chained assignment, which pandas deprecates and which
# leaves `data` unchanged when Copy-on-Write is active.
object_cols = data.select_dtypes(include=['object']).columns
for col in object_cols:
    data[col] = data[col].fillna(cat_value)

## **WoE Encoder**

In [85]:
# Weight-of-Evidence encoding, fitted against the label and applied to the
# same frame in one step.
# NOTE(review): the encoder is fitted on every row of `data` before the
# train/test split performed further down, so the held-out rows contribute
# their labels to the encoding and the reported AUC may be optimistic.
woe = WoEEncoder(fill_value=0.0001)
data = woe.fit_transform(data, data[target])

## **Train Test Split**

In [72]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns='TARGET')

# Shuffle rows, then hold out 20% for evaluation, stratified on the label.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

### **LGBM**

In [73]:
# Baseline gradient-boosted classifier, trained on the training split.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Probability of the positive class (second column) on the held-out split.
y_prob = model.predict_proba(X_test)[:, 1]

# ROC AUC on the held-out split, reported to two decimals.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [74]:
# Pair every predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# One row per feature, most important first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

importance_df

Unnamed: 0,Feature,Importance
34,EXT_SOURCE_3,256
33,EXT_SOURCE_2,239
32,EXT_SOURCE_1,229
13,DAYS_BIRTH,209
6,AMT_ANNUITY,156
14,DAYS_EMPLOYED,150
16,DAYS_ID_PUBLISH,143
51,DAYS_LAST_PHONE_CHANGE,112
31,ORGANIZATION_TYPE,101
15,DAYS_REGISTRATION,101


## **Drop Columns**

In [86]:
# Features removed before the second training run.
# NOTE(review): presumably picked from the bottom of the importance table
# above — the selection criterion is not recorded in this notebook.
columns = [
    'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6',
    'LIVE_CITY_NOT_WORK_CITY',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'FLAG_DOCUMENT_8',
    'FLAG_MOBIL',
    'FLAG_CONT_MOBILE',
    'REG_REGION_NOT_LIVE_REGION',
    'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_4',
    'FLAG_DOCUMENT_7',
]
data = data.drop(columns, axis=1)

### **LGBM**

In [87]:
# Rebuild predictors and label from the reduced frame.
y = data['TARGET']
X = data.drop(columns='TARGET')

# Same shuffle and stratified 80/20 split as the first run (same seed).
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [88]:
# Retrain the same classifier configuration on the reduced feature set.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Probability of the positive class (second column) on the held-out split.
y_prob = model.predict_proba(X_test)[:, 1]

# ROC AUC on the held-out split, reported to two decimals.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [89]:
# Pair every remaining predictor with the importance reported by the refitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# One row per feature, most important first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

importance_df

Unnamed: 0,Feature,Importance
30,EXT_SOURCE_3,256
28,EXT_SOURCE_1,240
29,EXT_SOURCE_2,234
13,DAYS_BIRTH,219
6,AMT_ANNUITY,168
14,DAYS_EMPLOYED,157
16,DAYS_ID_PUBLISH,135
47,DAYS_LAST_PHONE_CHANGE,131
15,DAYS_REGISTRATION,119
27,ORGANIZATION_TYPE,95
