# **Libraries**

In [1]:
import pandas as pd
import numpy as np

from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import ArbitraryNumberImputer

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [2]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Data**

## **Load Data**

In [3]:
install = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\installments_payments.csv",
    index_col=False)

In [8]:
install.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.359863,6948.359863
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525024,1716.525024
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130859,24350.130859
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040039,2160.584961


## **Reduce Memory Usage**

In [4]:
install_pay = functions.reduce_memory_usage(install)

Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%


## **Missing Values**

In [5]:
functions.MissingValues(install)

Unnamed: 0,NumberMissing,PercentageMissing,DataType
DAYS_ENTRY_PAYMENT,2905,0.02,float16
AMT_PAYMENT,2905,0.02,float32


## **Imputation**

In [6]:
ani = ArbitraryNumberImputer(arbitrary_number=-99999)
ani.fit(install)
install = ani.transform(install)

## **Aggregation**

In [9]:
install = install.groupby('SK_ID_CURR').agg({

    'SK_ID_PREV': 'count',  
    'AMT_INSTALMENT': ['sum', 'mean'],   
    'AMT_PAYMENT': ['sum', 'mean', 'max', 'min'],  

}).reset_index()


aggregated_install_pay['SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT'] = aggregated_install_pay[('AMT_PAYMENT', 'sum')] / aggregated_install_pay[('AMT_INSTALMENT', 'sum')]
aggregated_install_pay['MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT'] = aggregated_install_pay[('AMT_PAYMENT', 'mean')] - aggregated_install_pay[('AMT_INSTALMENT', 'mean')]


aggregated_install_pay.columns = ['_'.join(col).strip() if type(col) is tuple else col for col in aggregated_install_pay.columns]

aggregated_install_pay = aggregated_install_pay.rename(columns={

    'SK_ID_CURR_': 'SK_ID_CURR', 
    'SK_ID_PREV_count': 'NUM_PREVIOUS_APPLICATIONS',
    'AMT_INSTALMENT_sum': 'SUM_AMT_INSTALMENT',
    'AMT_INSTALMENT_mean': 'AVG_AMT_INSTALMENT',
    'AMT_PAYMENT_sum': 'SUM_AMT_PAYMENT',
    'AMT_PAYMENT_mean': 'AVG_AMT_PAYMENT',
    'AMT_PAYMENT_max': 'MAX_AMT_PAYMENT',
    'AMT_PAYMENT_min': 'MIN_AMT_PAYMENT',
    'SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT_': 'SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT',
    'MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT_': 'MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT'

})

Unnamed: 0,SK_ID_CURR,NUM_PREVIOUS_APPLICATIONS,SUM_AMT_INSTALMENT,AVG_AMT_INSTALMENT,SUM_AMT_PAYMENT,AVG_AMT_PAYMENT,MAX_AMT_PAYMENT,MIN_AMT_PAYMENT,SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT,MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT
0,100001,7,41195.93,5885.132324,41195.93,5885.132324,17397.900391,3951.0,1.0,0.0
1,100002,19,219625.7,11559.24707,219625.7,11559.24707,53093.746094,9251.775391,1.0,0.0
2,100003,25,1618865.0,64754.585938,1618865.0,64754.585938,560835.375,6662.970215,1.0,0.0
3,100004,3,21288.46,7096.154785,21288.46,7096.154785,10573.964844,5357.25,1.0,0.0
4,100005,9,56161.84,6240.205078,56161.84,6240.205078,17656.244141,4813.200195,1.0,0.0


In [None]:
aggregated_install_pay.head()

## **Merge Aggregated Installments and Application Train**

In [80]:
data = app_train.merge(aggregated_install_pay, on='SK_ID_CURR', how='inner')

In [81]:
data.shape

(291643, 131)

In [82]:
data.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NUM_PREVIOUS_APPLICATIONS,SUM_AMT_INSTALMENT,AVG_AMT_INSTALMENT,SUM_AMT_PAYMENT,AVG_AMT_PAYMENT,MAX_AMT_PAYMENT,MIN_AMT_PAYMENT,SUM_AMT_PAYMENT/SUM_AMT_INSTALMENT,MEAN_AMT_PAYMENT-MEAN_AMT_INSTALMENT
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018799,-9461,-637,-3648.0,-2120,-inf,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083008,0.262939,0.139404,0.024704,0.036896,0.972168,0.619141,0.014297,0.0,0.06897,0.083313,0.125,0.036896,0.020203,0.018997,0.0,0.0,0.025208,0.0383,0.972168,0.634277,0.014397,0.0,0.06897,0.083313,0.125,0.037689,0.022003,0.019806,0.0,0.0,0.024994,0.036896,0.972168,0.624512,0.014397,0.0,0.06897,0.083313,0.125,0.037506,0.020493,0.019302,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,19,219625.7,11559.24707,219625.7,11559.24707,53093.746094,9251.775391,1.0,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003542,-16765,-1188,-1186.0,-291,-inf,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311279,0.62207,-inf,0.095886,0.052887,0.984863,0.795898,0.060486,0.080017,0.034485,0.291748,0.333252,0.013,0.077271,0.054901,0.003901,0.009804,0.092407,0.053802,0.984863,0.804199,0.049713,0.080627,0.034485,0.291748,0.333252,0.012802,0.078979,0.055389,0.0,0.0,0.096802,0.052887,0.984863,0.798828,0.060791,0.080017,0.034485,0.291748,0.333252,0.013199,0.078674,0.055786,0.003901,0.010002,reg oper account,block of flats,0.071411,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,25,1618865.0,64754.585938,1618865.0,64754.585938,560835.375,6662.970215,1.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010033,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,-inf,0.556152,0.729492,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,UNKNOWN,UNKNOWN,-inf,UNKNOWN,UNKNOWN,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,3,21288.46,7096.154785,21288.46,7096.154785,10573.964844,5357.25,1.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008018,-19005,-3039,-9832.0,-2437,-inf,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,-inf,0.650391,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,UNKNOWN,UNKNOWN,-inf,UNKNOWN,UNKNOWN,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-inf,-inf,-inf,-inf,-inf,-inf,16,1007153.0,62947.089844,1007153.0,62947.089844,691786.875,2482.919922,1.0,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028656,-19932,-3038,-4312.0,-3458,-inf,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,-inf,0.322754,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,UNKNOWN,UNKNOWN,-inf,UNKNOWN,UNKNOWN,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,66,835985.3,12666.444336,806127.9,12214.05957,22678.785156,0.18,0.964285,-452.384766


## **WoE Encoder**

In [85]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
data = woe.transform(data)

## **Train Test Split**

In [87]:
X = data.drop('TARGET', axis=1)
y = data['TARGET']

X, y = shuffle(X, y, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=random_state)

### **LGBM**

In [88]:
model = lgb.LGBMClassifier(boosting_type='gbdt', 
                           num_leaves=31, 
                           max_depth=-1, 
                           learning_rate=0.1, 
                           n_estimators=100,
                           verbose=-1)

model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]  

auc_score = roc_auc_score(y_test, y_prob)
print(f"AUC Score: {auc_score:.2f}")

AUC Score: 0.76


## **Feature Importance**

In [89]:
feature_importance = model.feature_importances_
feature_names = X.columns

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
42,EXT_SOURCE_3,189
41,EXT_SOURCE_2,169
40,EXT_SOURCE_1,162
16,DAYS_BIRTH,134
8,AMT_ANNUITY,130
7,AMT_CREDIT,128
126,MAX_AMT_PAYMENT,123
127,MIN_AMT_PAYMENT,114
19,DAYS_ID_PUBLISH,92
17,DAYS_EMPLOYED,92
