# **Libraries**

In [68]:
import pandas as pd
import numpy as np

from feature_engine.selection import DropCorrelatedFeatures
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [69]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Load Data**

In [183]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)


test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
    index_col=False
)

# **Variables**

In [184]:
random_state = 101
target = 'TARGET'

## **Remove Empty Features**

In [185]:
list_columns = functions.check_columns_with_one_uniquevalue(test)

In [186]:
train = train.drop(list_columns, axis = 1)

## **Reduce Memory Usage**

In [187]:
train = functions.reduce_memory_usage(train)

Memory usage of dataframe is 260.42 MB
Memory usage after optimization is: 89.15 MB
Decreased by 65.8%


# **Data Cleaning**

In [188]:
train = train[train['CODE_GENDER'] != 'XNA']
train['DAYS_EMPLOYED'][train['DAYS_EMPLOYED'] == 365243] = np.nan

# **Feature Engineering**

## **Days Birth to Age in Years**

In [189]:
train["DAYS_BIRTH"] = abs(train["DAYS_BIRTH"])
train["DAYS_BIRTH"] = train["DAYS_BIRTH"]/365
train.rename(columns={'DAYS_BIRTH': 'Age'}, inplace=True)
train["Age"].describe()  

count    307507.000000
mean         43.937061
std          11.956116
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: Age, dtype: float64

## **Days Employed to Years Employed**

In [190]:
train["DAYS_EMPLOYED"] = abs(train["DAYS_EMPLOYED"])
train["DAYS_EMPLOYED"] = train["DAYS_EMPLOYED"]/365
train.rename(columns={"DAYS_EMPLOYED": 'YEARS_EMPLOYED'}, inplace=True)
train["YEARS_EMPLOYED"].describe()  

count    252133.000000
mean          6.531897
std           6.406377
min           0.000000
25%           2.101370
50%           4.515068
75%           8.698630
max          49.073973
Name: YEARS_EMPLOYED, dtype: float64

## **Days ID Published to Years ID Published**

In [191]:
train["DAYS_ID_PUBLISH"] = abs(train["DAYS_ID_PUBLISH"])
train["DAYS_ID_PUBLISH"] = train["DAYS_ID_PUBLISH"]/365
train.rename(columns={"DAYS_ID_PUBLISH": 'YEARS_ID_PUBLISH'}, inplace=True)
train["YEARS_ID_PUBLISH"].describe()  

count    307507.000000
mean          8.203292
std           4.135492
min           0.000000
25%           4.712329
50%           8.915068
75%          11.778082
max          19.717808
Name: YEARS_ID_PUBLISH, dtype: float64

## **Days Registration to Years Registration**

In [192]:
train["DAYS_REGISTRATION"] = abs(train["DAYS_REGISTRATION"])
train["DAYS_REGISTRATION"] = train["DAYS_REGISTRATION"]/365
train.rename(columns={"DAYS_REGISTRATION": 'YEARS_REGISTRATION'}, inplace=True)
train["YEARS_REGISTRATION"].describe()  

count    307507.000000
mean               NaN
std           0.000000
min           0.000000
25%           5.507812
50%          12.335938
75%          20.500000
max          67.625000
Name: YEARS_REGISTRATION, dtype: float64

## **Days Last Phone Change to Years Last Phone Change**

In [193]:
train["DAYS_LAST_PHONE_CHANGE"] = abs(train["DAYS_LAST_PHONE_CHANGE"])
train["DAYS_LAST_PHONE_CHANGE"] = train["DAYS_LAST_PHONE_CHANGE"]/365
train.rename(columns={"DAYS_LAST_PHONE_CHANGE": 'YEARS_LAST_PHONE_CHANGE'}, inplace=True)
train["YEARS_LAST_PHONE_CHANGE"].describe()  

count    307506.000000
mean               NaN
std           0.000000
min           0.000000
25%           0.750488
50%           2.074219
75%           4.300781
max          11.757812
Name: YEARS_LAST_PHONE_CHANGE, dtype: float64

## **Percentage of Credit Amount Relative to Client's Income**

In [194]:
train['CREDIT_INCOME_PERCENT'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']

## **Percentage of the Loan Annuity Relative to a Client's Income**

In [195]:
train['ANNUITY_INCOME_PERCENT'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']

## **Length of the Payment in Months**

In [196]:
train['CREDIT_TERM'] = train['AMT_ANNUITY'] / train['AMT_CREDIT'] 

## **Percentage of the Days Employed Relative to the Client's Age**

In [197]:
train['YEARS_EMPLOYED_PERCENT'] = train['YEARS_EMPLOYED'] / train['Age']

# **Missing Values**

In [160]:
functions.MissingValues(train)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
COMMONAREA_MEDI,214862,69.87,float16
COMMONAREA_MODE,214862,69.87,float16
COMMONAREA_AVG,214862,69.87,float16
NONLIVINGAPARTMENTS_MODE,213512,69.43,float16
NONLIVINGAPARTMENTS_MEDI,213512,69.43,float16
NONLIVINGAPARTMENTS_AVG,213512,69.43,float16
FONDKAPREMONT_MODE,210293,68.39,object
LIVINGAPARTMENTS_MEDI,210197,68.36,float16
LIVINGAPARTMENTS_MODE,210197,68.36,float16
LIVINGAPARTMENTS_AVG,210197,68.36,float16


## **Imputation**

In [198]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(train)
train = ani.transform(train)

In [199]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(train)
train = ci.transform(train)

## **WoE Encoder**

In [200]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(train, train[target])
train = woe.transform(train)

## **Train Test Split**

In [201]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [202]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [None]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

## **Keep Columns**

In [203]:
columns = ['CREDIT_TERM',
                   'EXT_SOURCE_3',
                   'EXT_SOURCE_1',
                   'EXT_SOURCE_2',
                   'YEARS_ID_PUBLISH',
                   'AMT_ANNUITY', 
                   'AMT_GOODS_PRICE', 
                   'YEARS_EMPLOYED_PERCENT',
                   'YEARS_EMPLOYED',
                   'ORGANIZATION_TYPE',
                   'AMT_CREDIT',
                   'YEARS_REGISTRATION',
                   'YEARS_LAST_PHONE_CHANGE',
                   'ANNUITY_INCOME_PERCENT',
                   'CREDIT_INCOME_PERCENT',
                   'OCCUPATION_TYPE',
                   'OWN_CAR_AGE',
                   'CODE_GENDER',
                   'REGION_POPULATION_RELATIVE',
                   'TARGET'] 
train = train[columns]

In [204]:
train.head()

Unnamed: 0,CREDIT_TERM,EXT_SOURCE_3,EXT_SOURCE_1,EXT_SOURCE_2,YEARS_ID_PUBLISH,AMT_ANNUITY,AMT_GOODS_PRICE,YEARS_EMPLOYED_PERCENT,YEARS_EMPLOYED,ORGANIZATION_TYPE,AMT_CREDIT,YEARS_REGISTRATION,YEARS_LAST_PHONE_CHANGE,ANNUITY_INCOME_PERCENT,CREDIT_INCOME_PERCENT,OCCUPATION_TYPE,OWN_CAR_AGE,CODE_GENDER,REGION_POPULATION_RELATIVE,TARGET
0,0.060749,0.139404,0.083008,0.262939,5.808219,24700.5,351000.0,0.067329,1.745205,0.154884,406597.5,9.992188,3.107422,0.121978,2.007889,0.297962,-inf,0.250917,0.018799,1
1,0.027598,-inf,0.311279,0.62207,0.79726,35698.5,1129500.0,0.070862,3.254795,-0.334278,1293502.5,3.25,2.267578,0.132217,4.79075,-0.266369,-inf,-0.154321,0.003542,0
2,0.05,0.729492,-inf,0.556152,6.934247,6750.0,135000.0,0.011814,0.616438,-0.157589,135000.0,11.671875,2.232422,0.1,2.0,0.297962,26.0,0.250917,0.010033,0
3,0.094941,-inf,-inf,0.650391,6.676712,29686.5,297000.0,0.159905,8.326027,0.154884,312682.5,26.9375,1.69043,0.2199,2.316167,0.297962,-inf,-0.154321,0.008018,0
4,0.042623,-inf,-inf,0.322754,9.473973,21865.5,513000.0,0.152418,8.323288,-0.340117,513000.0,11.8125,3.029297,0.179963,4.222222,-0.266369,-inf,0.250917,0.028656,0


### **LGBM**

In [205]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [206]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [207]:
# Get feature importance and sort by importance
feature_importance = model.feature_importances_
feature_names = X.columns

# Create a DataFrame to hold feature importance values
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
0,CREDIT_TERM,460
1,EXT_SOURCE_3,271
3,EXT_SOURCE_2,248
2,EXT_SOURCE_1,218
4,YEARS_ID_PUBLISH,177
5,AMT_ANNUITY,156
6,AMT_GOODS_PRICE,148
11,YEARS_REGISTRATION,147
12,YEARS_LAST_PHONE_CHANGE,132
13,ANNUITY_INCOME_PERCENT,132
