# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from feature_engine.selection import DropCorrelatedFeatures
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Load Data**

In [3]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)


test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
    index_col=False
)

# **Variables**

In [4]:
random_state = 101
target = 'TARGET'

## **Remove Empty Features**

In [5]:
list_columns = functions.check_columns_with_one_uniquevalue(test)

In [6]:
train = train.drop(list_columns, axis = 1)

## **Reduce Memory Usage**

In [7]:
train = functions.reduce_memory_usage(train)

Memory usage of dataframe is 260.42 MB
Memory usage after optimization is: 89.15 MB
Decreased by 65.8%


# **Data Cleaning**

In [8]:
train = train[train['CODE_GENDER'] != 'XNA']
train['DAYS_EMPLOYED'][train['DAYS_EMPLOYED'] == 365243] = np.nan

# **Feature Engineering**

## **'DAY_BIRTH' to 'AGE' in Years**

In [9]:
train["DAYS_BIRTH"] = abs(train["DAYS_BIRTH"])
train["DAYS_BIRTH"] = train["DAYS_BIRTH"]/365
train.rename(columns={'DAYS_BIRTH': 'AGE'}, inplace=True)
train["AGE"].describe()  

count    307507.000000
mean         43.937061
std          11.956116
min          20.517808
25%          34.008219
50%          43.150685
75%          53.923288
max          69.120548
Name: Age, dtype: float64

## **'DAYS_EMPLOYED' to 'YEARS_EMPLOYED'**

In [10]:
train["DAYS_EMPLOYED"] = abs(train["DAYS_EMPLOYED"])
train["DAYS_EMPLOYED"] = train["DAYS_EMPLOYED"]/365
train.rename(columns={"DAYS_EMPLOYED": 'YEARS_EMPLOYED'}, inplace=True)
train["YEARS_EMPLOYED"].describe()  

count    252133.000000
mean          6.531897
std           6.406377
min           0.000000
25%           2.101370
50%           4.515068
75%           8.698630
max          49.073973
Name: YEARS_EMPLOYED, dtype: float64

## **'DAYS_ID_PUBLISH' to 'YEARS_ID_PUBLISH'**

In [11]:
train["DAYS_ID_PUBLISH"] = abs(train["DAYS_ID_PUBLISH"])
train["DAYS_ID_PUBLISH"] = train["DAYS_ID_PUBLISH"]/365
train.rename(columns={"DAYS_ID_PUBLISH": 'YEARS_ID_PUBLISH'}, inplace=True)
train["YEARS_ID_PUBLISH"].describe()  

count    307507.000000
mean          8.203292
std           4.135492
min           0.000000
25%           4.712329
50%           8.915068
75%          11.778082
max          19.717808
Name: YEARS_ID_PUBLISH, dtype: float64

## **'DAYS_REGISTRATION' to 'YEARS_REGISTRATION'**

In [12]:
train["DAYS_REGISTRATION"] = abs(train["DAYS_REGISTRATION"])
train["DAYS_REGISTRATION"] = train["DAYS_REGISTRATION"]/365
train.rename(columns={"DAYS_REGISTRATION": 'YEARS_REGISTRATION'}, inplace=True)
train["YEARS_REGISTRATION"].describe()  

count    307507.000000
mean               NaN
std           0.000000
min           0.000000
25%           5.507812
50%          12.335938
75%          20.500000
max          67.625000
Name: YEARS_REGISTRATION, dtype: float64

## **'DAYS_LAST_PHONE_CHANGE' to 'YEARS_LAST_PHONE_CHANGE'**

In [13]:
train["DAYS_LAST_PHONE_CHANGE"] = abs(train["DAYS_LAST_PHONE_CHANGE"])
train["DAYS_LAST_PHONE_CHANGE"] = train["DAYS_LAST_PHONE_CHANGE"]/365
train.rename(columns={"DAYS_LAST_PHONE_CHANGE": 'YEARS_LAST_PHONE_CHANGE'}, inplace=True)
train["YEARS_LAST_PHONE_CHANGE"].describe()  

count    307506.000000
mean               NaN
std           0.000000
min           0.000000
25%           0.750488
50%           2.074219
75%           4.300781
max          11.757812
Name: YEARS_LAST_PHONE_CHANGE, dtype: float64

## **Ratio of Credit Amount to Client's Income**

In [14]:
train['CREDIT_INCOME_RATIO'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']

## **Ratio of the Loan Annuity to a Client's Income**

In [15]:
train['ANNUITY_INCOME_RATIO'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']

## **Income per Family Member**

In [None]:
train['INCOME_PER_FAMILY_MEMBER'] = train['AMT_INCOME_TOTAL'] / train['CNT_FAM_MEMBERS']

## **Length of the Payment**

In [16]:
train['CREDIT_TERM'] = train['AMT_ANNUITY'] / train['AMT_CREDIT'] 

## **Ratio of the Days Employed to the Client's Age**

In [17]:
train['YEARS_EMPLOYED_Ratio'] = train['YEARS_EMPLOYED'] / train['AGE']

## **Average of External Sources**

In [None]:
train['EXT_SOURCE_MEAN'] = train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)

## **Ratio of Dependents to Family Size**

In [None]:
train['DEPENDENTS_TO_FAMILY_SIZE'] = train['CNT_CHILDREN'] / train['CNT_FAM_MEMBERS']

# **Missing Values**

In [None]:
functions.MissingValues(train)

## **Imputation**

In [19]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(train)
train = ani.transform(train)

In [20]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(train)
train = ci.transform(train)

## **WoE Encoder**

In [21]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(train, train[target])
train = woe.transform(train)

## **Train Test Split**

In [22]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [23]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [24]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

importance_df

Unnamed: 0,Feature,Importance
112,CREDIT_TERM,333
42,EXT_SOURCE_3,202
41,EXT_SOURCE_2,197
40,EXT_SOURCE_1,167
16,Age,160
19,YEARS_ID_PUBLISH,119
8,AMT_ANNUITY,105
9,AMT_GOODS_PRICE,92
113,YEARS_EMPLOYED_PERCENT,80
17,YEARS_EMPLOYED,76


## **Keep Columns**

In [25]:
columns = ['CREDIT_TERM',
                   'EXT_SOURCE_3',
                   'EXT_SOURCE_1',
                   'EXT_SOURCE_2',
                   'YEARS_ID_PUBLISH',
                   'AMT_ANNUITY', 
                   'AMT_GOODS_PRICE', 
                   'YEARS_EMPLOYED_PERCENT',
                   'YEARS_EMPLOYED',
                   'ORGANIZATION_TYPE',
                   'AMT_CREDIT',
                   'YEARS_REGISTRATION',
                   'YEARS_LAST_PHONE_CHANGE',
                   'ANNUITY_INCOME_PERCENT',
                   'CREDIT_INCOME_PERCENT',
                   'OCCUPATION_TYPE',
                   'OWN_CAR_AGE',
                   'CODE_GENDER',
                   'REGION_POPULATION_RELATIVE',
                   'TARGET'] 
train = train[columns]

### **LGBM**

In [27]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [28]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [29]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

importance_df

Unnamed: 0,Feature,Importance
0,CREDIT_TERM,460
1,EXT_SOURCE_3,271
3,EXT_SOURCE_2,248
2,EXT_SOURCE_1,218
4,YEARS_ID_PUBLISH,177
5,AMT_ANNUITY,156
6,AMT_GOODS_PRICE,148
11,YEARS_REGISTRATION,147
12,YEARS_LAST_PHONE_CHANGE,132
13,ANNUITY_INCOME_PERCENT,132


## **Drop Columns**

In [30]:
columns = ['CODE_GENDER'
                ] 
train = train.drop(columns=columns)

### **LGBM**

In [31]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [32]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [34]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

importance_df

Unnamed: 0,Feature,Importance
0,CREDIT_TERM,425
1,EXT_SOURCE_3,292
2,EXT_SOURCE_2,261
3,EXT_SOURCE_1,220
4,YEARS_REGISTRATION,168
5,YEARS_ID_PUBLISH,159
6,YEARS_EMPLOYED,156
7,AMT_GOODS_PRICE,143
8,AMT_CREDIT,143
9,AMT_ANNUITY,140
