# **Libraries**

In [2]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import time

import warnings

# **Display**

In [3]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [4]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)

bureau = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\bureau.csv",
    index_col=False
)

balance = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\bureau_balance.csv",
    index_col=False
)

# **Variables**

In [5]:
random_state = 101
target = 'TARGET'

## **Reduce Memory Usage**

In [6]:
train = functions.reduce_memory_usage(train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


In [7]:
bureau = functions.reduce_memory_usage(bureau)

Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 112.95 MB
Decreased by 49.3%


In [8]:
balance = functions.reduce_memory_usage(balance)

Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 338.46 MB
Decreased by 45.8%


## **Missing Values**

In [9]:
functions.MissingValues(balance)

Unnamed: 0,NumberMissing,PercentageMissing,DataType


## **Imputation**

In [9]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(train)
train = ani.transform(train)

In [10]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(bureau)
bureau = ani.transform(bureau)

In [11]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(balance)
balance = ani.transform(balance)

In [12]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(train)
train = ci.transform(train)

In [13]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(bureau)
bureau = ci.transform(bureau)

In [14]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(balance)
balance = ci.transform(balance)

In [15]:
functions.MissingValues(train)

Unnamed: 0,NumberMissing,PercentageMissing,DataType


In [16]:
functions.MissingValues(bureau)

Unnamed: 0,NumberMissing,PercentageMissing,DataType


In [17]:
functions.MissingValues(balance)

Unnamed: 0,NumberMissing,PercentageMissing,DataType


## **Aggregation**

In [None]:
balance_agg = balance.groupby('SK_ID_BUREAU').agg(
    Num_Months=('MONTHS_BALANCE', 'count'),
    Num_Statuses_Months=('STATUS', lambda x: (x.isin(['1', '2', '3', '4', '5'])).sum()),
    Max_DPD_Months=('STATUS', lambda x: x.replace({'C': -1, 'X': -1}).astype(int).max()),
    Num_Closed_Months=('STATUS', lambda x: (x == 'C').sum()),
    Num_No_Loan_Months=('STATUS', lambda x: (x == 'X').sum()),
)

bureau_full = bureau.merge(balance_agg, on='SK_ID_BUREAU', how='left')

bureau_client_agg = bureau_full.groupby('SK_ID_CURR').agg(
    Total_Num_Months=('Num_Months', 'sum'),
    Total_Num_Statuses_Months=('Num_Statuses_Months', 'sum'),
    Avg_Max_DPD_Months=('Max_DPD_Months', 'mean'),
    Total_Num_Closed_Months=('Num_Closed_Months', 'sum'),
    Total_Num_No_Loan_Months=('Num_No_Loan_Months', 'sum'),
)

bureau_client_agg.head()

## **Merge**

In [19]:
data = train.merge(bureau_client_agg, on='SK_ID_CURR', how='left')

In [24]:
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,total_n_months,total_n_arrears_months,avg_max_dpd_months,total_n_closed_months,total_n_no_loan_months
0,100002,1,0.036236,0.250931,0.056242,-0.015093,0,202500.0,406597.5,24700.5,351000.0,0.014753,0.188675,0.111494,0.213706,-0.037947,0.018799,-9461,-637,-3648.0,-2120,-inf,1,1,0,1,1,0,0.297977,1.0,2,2,0.011729,10,0,0,0,0,0,0,0.154898,0.083008,0.262939,0.139404,0.024704,0.036896,0.972168,0.619141,0.014297,0.0,0.06897,0.083313,0.125,0.036896,0.020203,0.018997,0.0,0.0,0.025208,0.0383,0.972168,0.634277,0.014397,0.0,0.06897,0.083313,0.125,0.037689,0.022003,0.019806,0.0,0.0,0.024994,0.036896,0.972168,0.624512,0.014397,0.0,0.06897,0.083313,0.125,0.037506,0.020493,0.019302,0.0,0.0,-0.157558,-0.162933,0.0149,-0.093493,-0.159608,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,110.0,27.0,0.75,23.0,15.0
1,100003,0,0.036236,-0.154307,0.056242,0.03349,0,270000.0,1293502.5,35698.5,1129500.0,-0.080601,-0.36335,-0.439594,-0.071222,-0.037947,0.003542,-16765,-1188,-1186.0,-291,-inf,1,1,0,1,1,0,-0.266393,2.0,1,1,-0.043314,11,0,0,0,0,0,0,-0.334264,0.311279,0.62207,-inf,0.095886,0.052887,0.984863,0.795898,0.060486,0.080017,0.034485,0.291748,0.333252,0.013,0.077271,0.054901,0.003901,0.009804,0.092407,0.053802,0.984863,0.804199,0.049713,0.080627,0.034485,0.291748,0.333252,0.012802,0.078979,0.055389,0.0,0.0,0.096802,0.052887,0.984863,0.798828,0.060791,0.080017,0.034485,0.291748,0.333252,0.013199,0.078674,0.055786,0.003901,0.010002,-0.157558,-0.162933,0.071411,-0.150408,-0.159608,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
2,100004,0,-0.415543,0.250931,-0.117353,-0.015093,0,67500.0,135000.0,6750.0,135000.0,0.014753,0.188675,0.111494,0.213706,-0.037947,0.010033,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,0.297977,1.0,2,2,-0.043314,9,0,0,0,0,0,0,-0.157575,-inf,0.556152,0.729492,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,0.071397,0.137172,-inf,0.134395,0.15028,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
3,100006,0,0.036236,-0.154307,0.056242,-0.015093,0,135000.0,312682.5,29686.5,297000.0,0.014753,0.188675,0.111494,0.229088,-0.037947,0.008018,-19005,-3039,-9832.0,-2437,-inf,1,1,0,1,0,0,0.297977,2.0,2,2,0.011729,17,0,0,0,0,0,0,0.154898,-inf,0.650391,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,0.071397,0.137172,-inf,0.134395,0.15028,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-inf,-inf,-inf,-inf,-inf,-inf,,,,,
4,100007,0,0.036236,0.250931,0.056242,-0.015093,0,121500.0,513000.0,21865.5,513000.0,0.014753,0.188675,0.111494,0.213706,-0.037947,0.028656,-19932,-3038,-4312.0,-3458,-inf,1,1,0,1,0,0,-0.266393,1.0,2,2,0.003683,11,0,0,0,0,1,1,-0.340103,-inf,0.322754,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,0.071397,0.137172,-inf,0.134395,0.15028,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0


## **WoE Encoder**

In [20]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
data = woe.transform(data)

## **Train Test Split**

In [21]:
X = data.drop('TARGET', axis=1)
y = data['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [22]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [23]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
42,EXT_SOURCE_3,212
41,EXT_SOURCE_2,198
40,EXT_SOURCE_1,182
16,DAYS_BIRTH,167
7,AMT_CREDIT,158
8,AMT_ANNUITY,144
9,AMT_GOODS_PRICE,123
17,DAYS_EMPLOYED,112
19,DAYS_ID_PUBLISH,112
18,DAYS_REGISTRATION,88
