# **Libraries**

In [71]:
import pandas as pd
import numpy as np

from feature_engine.imputation import AddMissingIndicator
from feature_engine.imputation import RandomSampleImputer
from feature_engine.encoding import WoEEncoder

from xgboost import XGBRegressor

import functions
import importlib
importlib.reload(functions)

import warnings

# **Display**

In [72]:
%matplotlib inline

pd.options.display.max_rows = 300000
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

pd.set_option('display.max_rows', 200)

size = 20

# **Variables**

In [73]:
random_state = 101
target = 'TARGET'

# **Data**

## **Load Data**

In [101]:
app_train = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_train.csv",
    index_col=False
)


app_test = pd.read_csv(
    r"C:\Users\Dell\Documents\AI\Risk\Data\application_test.csv",
    index_col=False
)

# **Data Cleaning**

## **Removing Empty Features**

In [86]:
list_columns = functions.check_columns_with_one_uniquevalue(app_test)
    
print(f"There are {len(list_columns)} columns with just 1 unique value")

There are 11 columns with just 1 unique value


In [87]:
app_trian = app_train.drop(list_columns, axis = 1)
app_test = app_test.drop(list_columns, axis = 1)

## **Reduce Memory Usage**

In [88]:
app_trian= functions.reduce_memory_usage(app_trian)

Memory usage of dataframe is 260.42 MB
Memory usage after optimization is: 89.15 MB
Decreased by 65.8%


### **Correlation between each Feature and the Target Feature**

In [None]:
corr_matrix = app_trian.corr(numeric_only=True)
corr_matrix[target].sort_values(ascending=False)

## **External Source Features**

In [89]:
data = app_trian.copy()

selected_columns = ['SK_ID_CURR','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

ext_source = data [selected_columns]
data.drop(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'], axis=1, inplace=True)

# **Missing Values**

In [90]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
functions.MissingValues(data)

In [91]:
num_value = -99999

if (data == num_value ).any().any():
    print(f"The value {num_value} exists in the DataFrame.")
else:
    print(f"The value {num_value} does not exist in the DataFrame.")

The value -99999 does not exist in the DataFrame.


In [92]:
for col in data.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    data[col].fillna(num_value, inplace=True)

In [93]:
cat_value = 'UNKNOWN'

if (app_train == cat_value).any().any():
    print(f"The value {cat_value} exists in the DataFrame.")
else:
    print(f"The value {cat_value} does not exist in the DataFrame.")

The value UNKNOWN does not exist in the DataFrame.


In [94]:
for col in data.select_dtypes(include=['object']).columns:
    data[col].fillna(cat_value, inplace=True)

## **WoE Encoder**

In [95]:
woe = WoEEncoder(fill_value=0.0001)
woe.fit(data, data[target])
data = woe.transform(data)

## **Merge EXT Source Data Back**

In [96]:
data = ext_source.merge(data, how='inner')

In [None]:
df = data


def impute_column(df, target_column):
   
    df_missing = df[df[target_column].isna()]  
    df_no_missing = df.dropna(subset=[target_column])  
    
    if df_missing.empty: 
        return df

    X_train = df_no_missing.drop(columns=[target_column])  
    y_train = df_no_missing[target_column] 

    X_to_predict = df_missing.drop(columns=[target_column])  

    xgb_reg = XGBRegressor(enable_categorical=True, random_state=random_state)
    xgb_reg.fit(X_train, y_train)

    predicted_values = xgb_reg.predict(X_to_predict)

    df.loc[df[target_column].isna(), target_column] = predicted_values
    
    return df


columns_to_impute = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

for col in columns_to_impute:
    df = impute_column(df, col)

print("DataFrame with predicted missing values:")
df.head()

In [None]:
df = data

df.replace([np.inf, -np.inf], np.nan, inplace=True)  # Replace inf values
df.fillna(df.mean(), inplace=True)  # Impute missing values with the mean of the column

# Now you can proceed with the model
def impute_column(df, target_column):
    # Separate the rows with missing values and rows without missing values in the target column
    df_missing = df[df[target_column].isna()]  # Rows where the target_column is NaN
    df_no_missing = df.dropna(subset=[target_column])  # Rows where target_column is not NaN
    
    if df_missing.empty:  # If there are no missing values to impute
        return df

    # Features (X) and target (y)
    X_train = df_no_missing.drop(columns=[target_column])  # Features without missing target
    y_train = df_no_missing[target_column]  # Target without missing values

    X_to_predict = df_missing.drop(columns=[target_column])  # Features where the target is NaN

    # Train the XGBRegressor model
    xgb_reg = XGBRegressor()
    xgb_reg.fit(X_train, y_train)

    # Predict missing values
    predicted_values = xgb_reg.predict(X_to_predict)

    # Fill missing values in the original DataFrame
    df.loc[df[target_column].isna(), target_column] = predicted_values
    
    return df

# Impute missing values iteratively for each column
columns_to_impute = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

for col in columns_to_impute:
    df = impute_column(df, col)

print("DataFrame with predicted missing values:")
df.head()


In [100]:
df.head()

Unnamed: 0,SK_ID_CURR,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_11,FLAG_DOCUMENT_18,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,0.083008,0.262939,0.139404,1,0.036236,0.250931,0.056242,-0.015093,0,202500.0,406597.5,24700.5,351000.0,0.014753,0.188675,0.111494,0.213706,-0.037947,0.018799,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,0.297977,1.0,2,2,0.011729,10,0,0,0,0,0,0,0.154898,0.024704,0.036896,0.972168,0.619141,0.014297,0.0,0.06897,0.083313,0.125,0.036896,0.020203,0.018997,0.0,0.0,0.025208,0.0383,0.972168,0.634277,0.014397,0.0,0.06897,0.083313,0.125,0.037689,0.022003,0.019806,0.0,0.0,0.024994,0.036896,0.972168,0.624512,0.014397,0.0,0.06897,0.083313,0.125,0.037506,0.020493,0.019302,0.0,0.0,-0.157558,-0.162933,0.0149,-0.093493,-0.159608,2.0,2.0,2.0,2.0,-1134.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0.311279,0.62207,0.387021,0,0.036236,-0.154307,0.056242,0.03349,0,270000.0,1293502.5,35698.5,1129500.0,-0.080601,-0.36335,-0.439594,-0.071222,-0.037947,0.003542,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,-0.266393,2.0,1,1,-0.043314,11,0,0,0,0,0,0,-0.334264,0.095886,0.052887,0.984863,0.795898,0.060486,0.080017,0.034485,0.291748,0.333252,0.013,0.077271,0.054901,0.003901,0.009804,0.092407,0.053802,0.984863,0.804199,0.049713,0.080627,0.034485,0.291748,0.333252,0.012802,0.078979,0.055389,0.0,0.0,0.096802,0.052887,0.984863,0.798828,0.060791,0.080017,0.034485,0.291748,0.333252,0.013199,0.078674,0.055786,0.003901,0.010002,-0.157558,-0.162933,0.071411,-0.150408,-0.159608,1.0,0.0,1.0,0.0,-828.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0.537329,0.556152,0.729492,0,-0.415543,0.250931,-0.117353,-0.015093,0,67500.0,135000.0,6750.0,135000.0,0.014753,0.188675,0.111494,0.213706,-0.037947,0.010033,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,0.297977,1.0,2,2,-0.043314,9,0,0,0,0,0,0,-0.157575,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071397,0.137172,0.0,0.134395,0.15028,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0.71327,0.650391,0.384476,0,0.036236,-0.154307,0.056242,-0.015093,0,135000.0,312682.5,29686.5,297000.0,0.014753,0.188675,0.111494,0.229088,-0.037947,0.008018,-19005,-3039,-9832.0,-2437,,1,1,0,1,0,0,0.297977,2.0,2,2,0.011729,17,0,0,0,0,0,0,0.154898,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071397,0.137172,0.0,0.134395,0.15028,2.0,0.0,2.0,0.0,-617.0,1,0,0,0,0,0,0,0,0,0.0,0.0,0.0,,,
4,100007,0.597476,0.322754,0.599955,0,0.036236,0.250931,0.056242,-0.015093,0,121500.0,513000.0,21865.5,513000.0,0.014753,0.188675,0.111494,0.213706,-0.037947,0.028656,-19932,-3038,-4312.0,-3458,,1,1,0,1,0,0,-0.266393,1.0,2,2,0.003683,11,0,0,0,0,1,1,-0.340103,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071397,0.137172,0.0,0.134395,0.15028,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


### **Correlation between each Feature and the Target Feature**

In [104]:
corr_matrix = data.corr(numeric_only=True)
corr_matrix[target].sort_values(ascending=False)

TARGET                          1.000000
OCCUPATION_TYPE                 0.079664
DAYS_BIRTH                      0.078239
ORGANIZATION_TYPE               0.071869
NAME_INCOME_TYPE                0.063318
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
NAME_EDUCATION_TYPE             0.057441
DAYS_LAST_PHONE_CHANGE          0.055219
CODE_GENDER                     0.054633
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
WALLSMATERIAL_MODE              0.044065
EMERGENCYSTATE_MODE             0.042213
DAYS_REGISTRATION               0.041975
HOUSETYPE_MODE                  0.040699
NAME_FAMILY_STATUS              0.040291
OWN_CAR_AGE                     0.037612
NAME_HOUSING_TYPE               0.036947
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032394
DEF_60_CNT_SOCIA