# **Libraries**

In [1]:
import importlib
import os
import warnings
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from feature_engine.encoding import WoEEncoder
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import functions

importlib.reload(functions)

# **Display**

In [2]:
%matplotlib inline

# Pandas display limits. `display.max_rows` used to be assigned twice in this
# cell (300000, then 200); only the last value ever took effect, so it is set
# once here to avoid the contradictory setting.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 500)

# Blanket suppression of every warning category (this already covers
# FutureWarning, so no separate filter is needed).
# NOTE(review): this also hides pandas' SettingWithCopyWarning and
# deprecation notices -- consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")

# Not referenced by any cell visible in this notebook; kept in case other
# code relies on it.
size = 20

# **Data**

## **Load Data**

In [3]:
# Directory containing the application CSV files. The location can be
# overridden with the RISK_DATA_DIR environment variable so the notebook is
# not tied to one machine; the default is the original hard-coded folder.
DATA_DIR = Path(
    os.environ.get("RISK_DATA_DIR", r"C:\Users\Dell\Documents\AI\Risk\Data")
)

# index_col=False: do not use the first CSV column as the frame index.
app_train = pd.read_csv(DATA_DIR / "application_train.csv", index_col=False)
app_test = pd.read_csv(DATA_DIR / "application_test.csv", index_col=False)

# **Variables**

In [4]:
# Settings shared by the rest of the notebook.
target = 'TARGET'     # name of the label column
random_state = 101    # seed reused for shuffling and train/test splitting

# Work on a copy so the frame loaded from disk stays untouched.
data = app_train.copy()

## **Remove Empty Features**

In [5]:
# Column names flagged by the project helper, evaluated on the *test*
# application table (they are dropped from the training copy in the next cell).
# NOTE(review): the helper lives in functions.py, which is not visible here;
# presumably it returns the columns holding a single unique value -- confirm.
list_columns = functions.check_columns_with_one_uniquevalue(app_test)

In [6]:
# Remove the flagged columns from the working frame.
data = data.drop(columns=list_columns)

In [7]:
# Show (rows, columns) of the working frame after dropping the flagged columns.
data.shape

(307511, 111)

## **Drop Collinear Features**

In [8]:
# Restrict the correlation filter to genuine numeric features. Without an
# explicit variable list the transformer examines every numeric column, which
# would include the label and the row identifier; either could then take part
# in (or be removed by) the correlation filter.
corr_candidates = [
    col
    for col in data.select_dtypes(include='number').columns
    if col not in (target, 'SK_ID_CURR')
]

# Drop features whose correlation with another candidate exceeds the 0.7
# threshold; column order of the remaining frame is preserved.
dcf = DropCorrelatedFeatures(variables=corr_candidates, threshold=0.7)
data = dcf.fit_transform(data)

## **Drop Unneeded Features**

In [9]:
# The row identifier is not a feature; rebind instead of mutating in place.
data = data.drop(columns='SK_ID_CURR')

In [10]:
# Show (rows, columns) after the correlation filter and the ID column removal.
data.shape

(307511, 68)

## **Reduce Memory Usage**

In [11]:
# Rebind `data` to the result of the project helper, which prints a
# before/after memory report.
# NOTE(review): the helper is defined in functions.py (not visible here); the
# missing-value report further down lists float16 columns, so it presumably
# downcasts numeric dtypes -- confirm the precision loss is acceptable.
data = functions.reduce_memory_usage(data)

Memory usage of dataframe is 159.54 MB
Memory usage after optimization is: 63.35 MB
Decreased by 60.3%


# **Data Cleaning**

In [12]:
# Keep only rows whose gender is known. The explicit copy makes `data` an
# independent frame, so the assignment below never targets a temporary slice.
data = data[data['CODE_GENDER'] != 'XNA'].copy()

# 365243 in DAYS_EMPLOYED is treated as a missing-value placeholder and turned
# into NaN. The previous chained form (`data[col][mask] = ...`) wrote through an
# intermediate Series, which pandas does not guarantee to propagate back to the
# frame (and which is a silent no-op under Copy-on-Write). `mask` builds a new
# column instead and upcasts an integer column to float so it can hold NaN.
data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].mask(data['DAYS_EMPLOYED'] == 365243)

## **Missing Values**

In [13]:
# Display the per-column missing-value report from the project helper (the
# rendered table lists NumberMissing, PercentageMissing and DataType).
functions.MissingValues(data)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
COMMONAREA_AVG,214862,69.87,float16
NONLIVINGAPARTMENTS_AVG,213512,69.43,float16
FONDKAPREMONT_MODE,210293,68.39,object
YEARS_BUILD_AVG,204486,66.5,float16
OWN_CAR_AGE,202927,65.99,float16
LANDAREA_AVG,182588,59.38,float16
BASEMENTAREA_AVG,179942,58.52,float16
EXT_SOURCE_1,173376,56.38,float16
NONLIVINGAREA_AVG,169680,55.18,float16
WALLSMATERIAL_MODE,156340,50.84,object


In [14]:
# Sentinel written into missing floating-point values.
num_value = -99999

# float16 tops out at 65504 in magnitude, so it cannot represent the sentinel;
# widen those columns to float32 first so the fill value is stored exactly.
half_cols = data.select_dtypes(include=['float16']).columns
data = data.astype({col: 'float32' for col in half_cols})

# Fill every float column in one frame-level call. The previous per-column
# `data[col].fillna(..., inplace=True)` mutated an intermediate Series, which
# is not guaranteed to update the frame (no effect under Copy-on-Write).
float_cols = data.select_dtypes(include=['float16', 'float32', 'float64']).columns
data = data.fillna({col: num_value for col in float_cols})

In [15]:
# Label written into missing values of object (string) columns.
cat_value = 'UNKNOWN'

# Fill all object columns with one frame-level call. The previous per-column
# `data[col].fillna(..., inplace=True)` mutated an intermediate Series, which
# is not guaranteed to update the frame (no effect under Copy-on-Write).
object_cols = data.select_dtypes(include=['object']).columns
data = data.fillna({col: cat_value for col in object_cols})

## **WoE Encoder**

In [16]:
# Weight-of-Evidence encoding is computed from the label, so fitting it on all
# rows leaks hold-out labels into the features and inflates the reported AUC.
# Fit on the training rows only.
#
# The modelling cells always split with shuffle(..., random_state) followed by
# a stratified train_test_split(test_size=0.2, random_state). Both steps depend
# only on the row count, the label order and the seed, so repeating them here
# on the label column should select exactly the rows that later form X_train.
shuffled_target = shuffle(data[target], random_state=random_state)
fit_target, _ = train_test_split(
    shuffled_target,
    test_size=0.2,
    stratify=shuffled_target,
    random_state=random_state,
)
woe_fit_rows = data.loc[fit_target.index]

# fill_value stands in for a zero numerator/denominator when a category has no
# events or no non-events in the fitting rows.
woe = WoEEncoder(fill_value=0.0001)
woe.fit(woe_fit_rows, woe_fit_rows[target])

# Encode every row with the train-derived mapping.
# NOTE(review): categories that occur only in hold-out rows have no mapping;
# with the encoder's default `unseen` setting they presumably become NaN --
# confirm against the installed feature_engine version.
data = woe.transform(data)

## **Train Test Split**

In [17]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

### **LGBM**

In [18]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [19]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
34,EXT_SOURCE_3,240
33,EXT_SOURCE_2,238
32,EXT_SOURCE_1,220
13,DAYS_BIRTH,205
6,AMT_ANNUITY,175
16,DAYS_ID_PUBLISH,160
14,DAYS_EMPLOYED,155
51,DAYS_LAST_PHONE_CHANGE,104
15,DAYS_REGISTRATION,100
31,ORGANIZATION_TYPE,99


## **Drop Columns**

In [20]:
# Features removed in this elimination round.
# NOTE(review): presumably the lowest-ranked entries of the importance table
# above -- confirm against the full table.
columns = [
    'FLAG_DOCUMENT_5',
    'FLAG_DOCUMENT_6',
    'LIVE_CITY_NOT_WORK_CITY',
    'AMT_REQ_CREDIT_BUREAU_DAY',
    'FLAG_DOCUMENT_8',
    'FLAG_MOBIL',
    'FLAG_CONT_MOBILE',
    'REG_REGION_NOT_LIVE_REGION',
    'FLAG_DOCUMENT_9',
    'FLAG_DOCUMENT_4',
    'FLAG_DOCUMENT_7',
]
data = data.drop(columns, axis=1)

### **LGBM**

In [21]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [22]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [23]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
30,EXT_SOURCE_3,242
29,EXT_SOURCE_2,240
28,EXT_SOURCE_1,223
13,DAYS_BIRTH,209
6,AMT_ANNUITY,192
14,DAYS_EMPLOYED,173
16,DAYS_ID_PUBLISH,146
47,DAYS_LAST_PHONE_CHANGE,113
15,DAYS_REGISTRATION,105
27,ORGANIZATION_TYPE,91


## **Drop Columns**

In [24]:
# Features removed in this elimination round.
# NOTE(review): presumably the lowest-ranked entries of the importance table
# above -- confirm against the full table.
columns = [
    'FLAG_DOCUMENT_18',
    'FLAG_PHONE',
    'AMT_REQ_CREDIT_BUREAU_MON',
    'LIVE_REGION_NOT_WORK_REGION',
    'FLAG_OWN_REALTY',
    'EMERGENCYSTATE_MODE',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'HOUSETYPE_MODE',
    'FLAG_DOCUMENT_11',
    'FLAG_EMAIL',
]
data = data.drop(columns, axis=1)

### **LGBM**

In [25]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [26]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [27]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
25,EXT_SOURCE_2,252
26,EXT_SOURCE_3,237
12,DAYS_BIRTH,222
24,EXT_SOURCE_1,213
5,AMT_ANNUITY,202
13,DAYS_EMPLOYED,161
15,DAYS_ID_PUBLISH,159
41,DAYS_LAST_PHONE_CHANGE,116
14,DAYS_REGISTRATION,112
23,ORGANIZATION_TYPE,92


## **Drop Columns**

In [28]:
# Features removed in this elimination round.
# NOTE(review): presumably the lowest-ranked entries of the importance table
# above -- confirm against the full table.
columns = [
    'FLAG_OWN_CAR',
    'REGION_RATING_CLIENT',
    'COMMONAREA_AVG',
    'NAME_CONTRACT_TYPE',
    'FLAG_DOCUMENT_3',
    'FLOORSMAX_AVG',
    'NONLIVINGAPARTMENTS_AVG',
    'NAME_INCOME_TYPE',
    'REG_CITY_NOT_LIVE_CITY',
    'WALLSMATERIAL_MODE',
    'CNT_CHILDREN',
    'FLAG_WORK_PHONE',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'NAME_TYPE_SUITE',
    'FONDKAPREMONT_MODE',
]
data = data.drop(columns, axis=1)

### **LGBM**

In [29]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [30]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [31]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
18,EXT_SOURCE_3,271
17,EXT_SOURCE_2,253
16,EXT_SOURCE_1,249
7,DAYS_BIRTH,228
2,AMT_ANNUITY,217
8,DAYS_EMPLOYED,163
10,DAYS_ID_PUBLISH,151
9,DAYS_REGISTRATION,143
28,DAYS_LAST_PHONE_CHANGE,117
15,ORGANIZATION_TYPE,107


## **Drop Columns**

In [32]:
# Features removed in this elimination round.
# NOTE(review): presumably the lowest-ranked entries of the importance table
# above -- confirm against the full table.
columns = [
    'NAME_FAMILY_STATUS',
    'AMT_REQ_CREDIT_BUREAU_YEAR',
    'CODE_GENDER',
    'DEF_30_CNT_SOCIAL_CIRCLE',
    'OBS_30_CNT_SOCIAL_CIRCLE',
    'ENTRANCES_AVG',
    'AMT_REQ_CREDIT_BUREAU_QRT',
    'NONLIVINGAREA_AVG',
    'WEEKDAY_APPR_PROCESS_START',
    'YEARS_BUILD_AVG',
    'NAME_HOUSING_TYPE',
]
data = data.drop(columns, axis=1)

### **LGBM**

In [33]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [34]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.75


## **Feature Importance**

In [35]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
13,EXT_SOURCE_2,299
14,EXT_SOURCE_3,298
12,EXT_SOURCE_1,279
1,AMT_ANNUITY,252
4,DAYS_BIRTH,247
5,DAYS_EMPLOYED,200
7,DAYS_ID_PUBLISH,173
19,DAYS_LAST_PHONE_CHANGE,162
6,DAYS_REGISTRATION,155
11,ORGANIZATION_TYPE,119


## **Drop Columns**

In [36]:
# Features removed in this elimination round.
# NOTE(review): presumably the lowest-ranked entries of the importance table
# above -- confirm against the full table.
columns = [
    'OWN_CAR_AGE',
    'APARTMENTS_AVG',
    'YEARS_BEGINEXPLUATATION_AVG',
    'HOUR_APPR_PROCESS_START',
    'BASEMENTAREA_AVG',
    'NAME_EDUCATION_TYPE',
    'LANDAREA_AVG',
]
data = data.drop(columns, axis=1)

### **LGBM**

In [37]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [38]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.74


## **Feature Importance**

In [39]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
10,EXT_SOURCE_2,323
1,AMT_ANNUITY,313
11,EXT_SOURCE_3,310
9,EXT_SOURCE_1,308
3,DAYS_BIRTH,296
6,DAYS_ID_PUBLISH,236
4,DAYS_EMPLOYED,219
5,DAYS_REGISTRATION,212
12,DAYS_LAST_PHONE_CHANGE,197
0,AMT_INCOME_TOTAL,173


## **Missing Values**

In [40]:
# Re-run the project's missing-value report on the reduced feature set; the
# rendered output below contains only the header row, i.e. no column is listed
# as having missing values at this point.
functions.MissingValues(data)

Unnamed: 0,NumberMissing,PercentageMissing,DataType


In [41]:
# Sentinel written into missing floating-point values.
num_value = -99999

# float16 tops out at 65504 in magnitude, so it cannot represent the sentinel;
# widen those columns to float32 first so the fill value is stored exactly.
half_cols = data.select_dtypes(include=['float16']).columns
data = data.astype({col: 'float32' for col in half_cols})

# Fill every float column in one frame-level call. The previous per-column
# `data[col].fillna(..., inplace=True)` mutated an intermediate Series, which
# is not guaranteed to update the frame (no effect under Copy-on-Write).
float_cols = data.select_dtypes(include=['float16', 'float32', 'float64']).columns
data = data.fillna({col: num_value for col in float_cols})

### **LGBM**

In [42]:
# Separate the label from the predictors.
y = data['TARGET']
X = data.drop(columns=['TARGET'])

# Reproducible shuffle, then a stratified 80/20 train/test split.
X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [43]:
# Train a LightGBM gradient-boosted classifier with fixed hyper-parameters.
model = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=100,
    verbose=-1,
).fit(X_train, y_train)

# Second predict_proba column: probability of the higher class label.
y_prob = model.predict_proba(X_test)[:, 1]

# Hold-out ROC AUC.
auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.74


## **Feature Importance**

In [44]:
# Pair each predictor name with the importance reported by the fitted model.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate and rank, most important first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
10,EXT_SOURCE_2,323
1,AMT_ANNUITY,313
11,EXT_SOURCE_3,310
9,EXT_SOURCE_1,308
3,DAYS_BIRTH,296
6,DAYS_ID_PUBLISH,236
4,DAYS_EMPLOYED,219
5,DAYS_REGISTRATION,212
12,DAYS_LAST_PHONE_CHANGE,197
0,AMT_INCOME_TOTAL,173
