# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

## **Load Data**

In [32]:
train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\Data\train.csv",
    index_col=False
)

bureau = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\bureau.csv",
    index_col=False
)

balance = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\bureau_balance.csv",
    index_col=False
)

In [33]:
train.head()

Unnamed: 0,SK_ID_CURR,ANNUITY_TO_CREDIT_RATIO,EXT_SOURCE_3,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_MEAN,ANNUAL_PAYMENT_TO_CREDIT_RATIO,AGE,YEARS_ID_PUBLISH,AMT_ANNUITY,AMT_GOODS_PRICE,ANNUITY_TO_INCOME_RATIO,YEARS_LAST_PHONE_CHANGE,YEARS_EMPLOYED_AGE_PRODUCT,TARGET
0,100002,0.06076,0.1394,0.083,0.263,0.1617,0.729,25.92,5.81,24700.5,351000.0,0.12195,3.107,45.25,1
1,100003,0.0276,-inf,0.3113,0.622,0.4668,0.3313,45.94,0.7974,35698.5,1129500.0,0.1322,2.268,149.5,0
2,100004,0.05,0.7295,-inf,0.556,0.6426,0.6,52.2,6.934,6750.0,135000.0,0.1,2.232,32.16,0
3,100006,0.095,-inf,-inf,0.6504,0.6504,1.14,52.06,6.676,29686.5,297000.0,0.2198,1.69,433.5,0
4,100007,0.04263,-inf,-inf,0.3228,0.3228,0.511,54.6,9.48,21865.5,513000.0,0.1799,3.03,454.2,0


## **Reduce Memory Usage**

In [34]:
train = functions.reduce_memory_usage(train)

Memory usage of dataframe is 35.19 MB
Memory usage after optimization is: 26.10 MB
Decreased by 25.8%


In [35]:
bureau = functions.reduce_memory_usage(bureau)

Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 112.95 MB
Decreased by 49.3%


In [36]:
balance = functions.reduce_memory_usage(balance)

Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 338.46 MB
Decreased by 45.8%


## **Imputation**

In [37]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(train)
train = ani.transform(train)

In [38]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(bureau)
bureau = ani.transform(bureau)

In [39]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(balance)
balance = ani.transform(balance)

In [41]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(bureau)
bureau = ci.transform(bureau)

In [42]:
ci = CategoricalImputer(imputation_method='missing', fill_value='UNKNOWN')
ci.fit(balance)
balance = ci.transform(balance)

## **Aggregation**

In [43]:
balance_agg = balance.groupby('SK_ID_BUREAU').agg(
    NUM_MONTHS=('MONTHS_BALANCE', 'count'),
    SUM_STATUSES=('STATUS', lambda x: (x.isin(['1', '2', '3', '4', '5'])).sum()),
    MAX_DPD=('STATUS', lambda x: x.replace({'C': -1, 'X': -1}).astype(int).max()),
    NUM_CLOSED=('STATUS', lambda x: (x == 'C').sum()),
    NUM_UNKNOWN=('STATUS', lambda x: (x == 'X').sum()),
)

bureau_merge = bureau.merge(balance_agg, on='SK_ID_BUREAU', how='left')

bureau_balance_agg = bureau_merge.groupby('SK_ID_CURR').agg(
    NUM_LOANS=('SK_ID_BUREAU', 'count'),
    TOTAL_NUM_MONTHS=('NUM_MONTHS', 'sum'),
    TOTAL_SUM_STATUSES_=('SUM_STATUSES', 'sum'),
    AVG_MAX_DPD=('MAX_DPD', 'mean'),
    TOTAL_NUM_CLOSED=('NUM_CLOSED', 'sum'),
    TOTAL_NUM_UNKNOWN=('NUM_UNKNOWN', 'sum'),
    NUM_ACTIVE_LOANS=('CREDIT_ACTIVE', lambda x: (x == 'Active').sum()),
    TOTAL_DEBIT=('AMT_CREDIT_SUM_DEBT', 'sum'),
    TOTAL_CREDIT_AMT=('AMT_CREDIT_SUM', 'sum'),
    DEBT_CREDIT_RATIO=('AMT_CREDIT_SUM_DEBT', lambda x: x.sum() / (bureau.loc[x.index, 'AMT_CREDIT_SUM'].sum() + 1e-5)),
    TOTAL_OVERDUE=('AMT_CREDIT_SUM_OVERDUE', 'sum'),
    MAX_OVERDUE=('AMT_CREDIT_MAX_OVERDUE', 'max'),
    AVG_DAYS_OVERDUE=('CREDIT_DAY_OVERDUE', 'mean'),
    NUM_PROLONGED_LOANS=('CNT_CREDIT_PROLONG', lambda x: (x > 0).sum()),
)

bureau_balance_agg.head()

Unnamed: 0_level_0,NUM_LOANS,TOTAL_NUM_MONTHS,TOTAL_SUM_STATUSES_,AVG_MAX_DPD,TOTAL_NUM_CLOSED,TOTAL_NUM_UNKNOWN,NUM_ACTIVE_LOANS,TOTAL_DEBIT,TOTAL_CREDIT_AMT,DEBT_CREDIT_RATIO,TOTAL_OVERDUE,MAX_OVERDUE,AVG_DAYS_OVERDUE,NUM_PROLONGED_LOANS
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100001,7,172.0,1.0,0.142857,110.0,30.0,3,596686.5,1453365.0,0.410555,0.0,-99999.0,0.0,0
100002,8,110.0,27.0,0.75,23.0,15.0,2,-54216.0,865055.6,-0.062673,0.0,5043.64502,0.0,0
100003,4,0.0,0.0,,0.0,0.0,1,0.0,1017400.0,0.0,0.0,0.0,0.0,0
100004,2,0.0,0.0,,0.0,0.0,0,0.0,189037.8,0.0,0.0,0.0,0.0,0
100005,3,21.0,0.0,0.0,5.0,2.0,2,568408.5,657126.0,0.864992,0.0,0.0,0.0,0


## **Merge**

In [44]:
train = train.merge(bureau_balance_agg, on='SK_ID_CURR', how='left')

## **Train Test Split**

In [49]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [50]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [51]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
1,ANNUITY_TO_CREDIT_RATIO,313
7,AGE,223
5,EXT_SOURCE_MEAN,181
6,ANNUAL_PAYMENT_TO_CREDIT_RATIO,172
9,AMT_ANNUITY,163
13,YEARS_EMPLOYED_AGE_PRODUCT,162
22,TOTAL_CREDIT_AMT,159
11,ANNUITY_TO_INCOME_RATIO,145
2,EXT_SOURCE_3,141
4,EXT_SOURCE_2,141


## **Drop Columns**

In [52]:
columns = ['MAX_OVERDUE',
           'NUM_ACTIVE_LOANS',
           'YEARS_LAST_PHONE_CHANGE',
           'TOTAL_DEBIT',
           'NUM_LOANS',
           'AVG_MAX_DPD',
           'TOTAL_NUM_MONTHS',
           'TOTAL_NUM_CLOSED',
           'TOTAL_NUM_UNKNOWN',
           'TOTAL_SUM_STATUSES_',
           'TOTAL_OVERDUE',
           'AVG_DAYS_OVERDUE',
           'NUM_PROLONGED_LOANS'

                ] 
train = train.drop(columns=columns)

### **LGBM**

In [53]:
X = train.drop('TARGET', axis=1)
y = train['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

In [54]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [55]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

importance_df

Unnamed: 0,Feature,Importance
0,ANNUITY_TO_CREDIT_RATIO,381
1,AGE,248
2,TOTAL_CREDIT_AMT,238
3,EXT_SOURCE_MEAN,223
4,DEBT_CREDIT_RATIO,205
5,ANNUAL_PAYMENT_TO_CREDIT_RATIO,203
6,EXT_SOURCE_2,196
7,AMT_ANNUITY,188
8,ANNUITY_TO_INCOME_RATIO,180
9,YEARS_EMPLOYED_AGE_PRODUCT,176


In [12]:
train.head()

Unnamed: 0,SK_ID_CURR,ANNUITY_TO_CREDIT_RATIO,EXT_SOURCE_3,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_MEAN,ANNUAL_PAYMENT_TO_CREDIT_RATIO,AGE,YEARS_ID_PUBLISH,AMT_ANNUITY,AMT_GOODS_PRICE,ANNUITY_TO_INCOME_RATIO,YEARS_LAST_PHONE_CHANGE,YEARS_EMPLOYED_AGE_PRODUCT,TARGET
0,100002,0.06076,0.1394,0.083,0.263,0.1617,0.729,25.921875,5.808594,24700.5,351000.0,0.12195,3.107,45.25,1
1,100003,0.0276,-inf,0.3113,0.622,0.4668,0.3313,45.9375,0.797363,35698.5,1129500.0,0.1322,2.268,149.5,0
2,100004,0.05,0.7295,-inf,0.556,0.6426,0.6,52.1875,6.933594,6750.0,135000.0,0.1,2.232,32.16,0
3,100006,0.095,-inf,-inf,0.6504,0.6504,1.14,52.0625,6.675781,29686.5,297000.0,0.2198,1.69,433.5,0
4,100007,0.04263,-inf,-inf,0.3228,0.3228,0.511,54.59375,9.476562,21865.5,513000.0,0.1799,3.03,454.2,0
