# **Libraries**

In [1]:
import importlib
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from feature_engine.encoding import WoEEncoder
from feature_engine.imputation import AddMissingIndicator
from feature_engine.imputation import RandomSampleImputer

from xgboost import XGBRegressor

import functions

importlib.reload(functions)

# **Display**

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Pandas display limits. The original cell set max_rows twice (300000, then
# 200); only the last assignment took effect, so it is stated once here.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 500

# NOTE(review): the first call silences *every* warning for the whole session,
# which also hides pandas' chained-assignment warnings; consider narrowing it.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=FutureWarning)

# Not referenced by the visible cells; presumably a plot/font size used later
# in the notebook — confirm before removing.
size = 20

# **Variables**

In [19]:
# Name of the label column; later cells index the frames with it.
target = "TARGET"
# Shared seed value for stochastic steps.
random_state = 101

# **Data**

## **Load Data**

In [3]:
# Folder that holds the application CSV files. It defaults to the location the
# notebook was written against and can be redirected by setting the
# RISK_DATA_DIR environment variable, so the path lives in exactly one place.
DATA_DIR = Path(
    os.environ.get("RISK_DATA_DIR", r"C:\Users\Dell\Documents\AI\Risk\Data")
)

# index_col=False keeps pandas from using the first CSV column as the index.
app_train = pd.read_csv(DATA_DIR / "application_train.csv", index_col=False)
app_test = pd.read_csv(DATA_DIR / "application_test.csv", index_col=False)

# **Data Cleaning**

## **Removing Constant (Single-Value) Features**

In [37]:
# Ask the project helper for the test-set columns that hold just one unique
# value (the helper lives in the local `functions` module).
list_columns = functions.check_columns_with_one_uniquevalue(app_test)

print(f"There are {len(list_columns)} columns with just 1 unique value")

There are 11 columns with just 1 unique value


In [38]:
# Remove the single-valued columns from both frames. The original cell
# assigned the pruned training frame to the misspelled name `app_trian`, so
# `app_train` itself was never pruned.
# errors="ignore" keeps the cell re-runnable once the columns are gone.
app_train = app_train.drop(columns=list_columns, errors="ignore")
app_test = app_test.drop(columns=list_columns, errors="ignore")

# Later cells in this notebook read the pruned frame through the misspelled
# name, so keep it bound to the same object.
app_trian = app_train

## **Reduce Memory Usage**

In [39]:
# Shrink the training frame with the project helper; the recorded run reports
# 260.42 MB -> 89.15 MB. How the helper achieves this is not visible here
# (presumably dtype downcasting — confirm in `functions`).
app_train = functions.reduce_memory_usage(app_train)

Memory usage of dataframe is 260.42 MB
Memory usage after optimization is: 89.15 MB
Decreased by 65.8%


In [40]:
# Same helper applied to the test frame; the recorded run reports
# 40.91 MB -> 14.09 MB.
app_test = functions.reduce_memory_usage(app_test)

Memory usage of dataframe is 40.91 MB
Memory usage after optimization is: 14.09 MB
Decreased by 65.6%


### **Correlation between each Feature and the Target Feature**

In [None]:
# Pairwise correlation of the numeric columns, then the target's column of
# that matrix ranked from most positive to most negative.
# NOTE(review): `app_trian` (sic) is the column-pruned training frame created
# in the "Removing Empty Features" cell.
corr_matrix = app_trian.corr(numeric_only=True)
corr_matrix.loc[:, target].sort_values(ascending=False)

## **External Source Features**

### **Predicting with XGB Model**

In [None]:
# Fill missing EXT_SOURCE_* values with XGBoost regressors.
#
# Fixes relative to the original cell: the loop body had lost its indentation
# (the cell could not be parsed), it referred to `application_train` /
# `application_test` although this notebook's frames are `app_train` /
# `app_test`, and it wrote predictions through chained indexing, which may
# update a temporary copy instead of the frame.

# Columns are processed from fewest to most missing values (order taken from
# the original comment).
ext_source_order = ['EXT_SOURCE_2', 'EXT_SOURCE_3', 'EXT_SOURCE_1']
non_feature_columns = set(ext_source_order) | {'SK_ID_CURR'}

# Use only non-object columns as predictors. The list is derived from the test
# frame's columns and kept in column order, so it is deterministic (the
# original built it from a set, whose order is arbitrary).
columns_for_modelling = [
    col
    for col in app_test.dtypes[app_test.dtypes != 'object'].index
    if col not in non_feature_columns
]

for ext_col in ext_source_order:
    train_missing = app_train[ext_col].isna()
    test_missing = app_test[ext_col].isna()

    # X_model / Y_train: training rows where the column is observed.
    X_model = app_train.loc[~train_missing, columns_for_modelling]
    Y_train = app_train.loc[~train_missing, ext_col]

    # Seeded with the notebook-level `random_state` instead of a second
    # hard-coded seed.
    xg = XGBRegressor(
        n_estimators=1000,
        max_depth=3,
        learning_rate=0.1,
        n_jobs=-1,
        random_state=random_state,
    )
    xg.fit(X_model, Y_train)

    # X_train_missing / X_test_missing: rows whose value has to be predicted.
    # Skip the prediction when a frame has nothing missing for this column.
    if train_missing.any():
        X_train_missing = app_train.loc[train_missing, columns_for_modelling]
        app_train.loc[train_missing, ext_col] = xg.predict(X_train_missing)
    if test_missing.any():
        X_test_missing = app_test.loc[test_missing, columns_for_modelling]
        app_test.loc[test_missing, ext_col] = xg.predict(X_test_missing)

    # The freshly completed column becomes a predictor for the next one.
    columns_for_modelling = columns_for_modelling + [ext_col]

In [25]:
# Row id first, followed by the three external-source score columns.
selected_columns = ['SK_ID_CURR','EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# Set the id and the EXT_SOURCE_* columns aside ...
# NOTE(review): `app_trian` (sic) is the column-pruned training frame created
# in the "Removing Empty Features" cell.
ext_source = app_trian[selected_columns]

# ... and continue with a separate frame that keeps the id but no longer
# carries the EXT_SOURCE_* columns (selected_columns[1:]).
reduced_app_train = app_trian.drop(columns=selected_columns[1:])

# **Missing Values**

In [None]:
# Run the project's missing-value helper on the reduced training frame.
# Its output format is defined in `functions` and is not visible here
# (presumably a per-column missing-value summary — confirm).
functions.MissingValues(reduced_app_train)

In [27]:
# Sentinel that later replaces missing values in the float columns.
num_value = -99999

# Make sure the sentinel does not already occur as a value anywhere in the
# frame; otherwise it could not be told apart from real data once filled in.
if not (reduced_app_train == num_value).to_numpy().any():
    print(f"The value {num_value} does not exist in the DataFrame.")
else:
    print(f"The value {num_value} exists in the DataFrame.")

The value -99999 does not exist in the DataFrame.


In [28]:
# Replace missing values in every float column with the numeric sentinel.
# The result is assigned back to the column: the original
# `frame[col].fillna(..., inplace=True)` operates on an intermediate Series and
# may leave the frame itself unchanged.
# NOTE(review): -99999 lies outside float16's finite range (about +/-65504);
# if any column really is float16, confirm the filled value and resulting dtype.
for col in reduced_app_train.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    reduced_app_train[col] = reduced_app_train[col].fillna(num_value)

In [None]:
# NOTE(review): this cell repeats the float-fill logic of the cell directly
# above it; one of the two can be deleted.
# The filled Series is assigned back to the column rather than relying on a
# chained `fillna(..., inplace=True)`, which may not update the frame.
for col in reduced_app_train.select_dtypes(include=['float16', 'float32', 'float64']).columns:
    reduced_app_train[col] = reduced_app_train[col].fillna(num_value)

In [29]:
# Placeholder that later replaces missing values in the object columns.
cat_value = 'UNKNOWN'

# Check the frame that actually receives the placeholder. The original cell
# inspected `app_train`, while the numeric check and the fill both operate on
# `reduced_app_train`.
if (reduced_app_train == cat_value).any().any():
    print(f"The value {cat_value} exists in the DataFrame.")
else:
    print(f"The value {cat_value} does not exist in the DataFrame.")

The value UNKNOWN does not exist in the DataFrame.


In [30]:
# Replace missing values in every object (string) column with the placeholder.
# The filled Series is assigned back to the column: a chained
# `frame[col].fillna(..., inplace=True)` works on an intermediate Series and
# may leave the frame itself unchanged.
for col in reduced_app_train.select_dtypes(include=['object']).columns:
    reduced_app_train[col] = reduced_app_train[col].fillna(cat_value)

## **WoE Encoder**

In [31]:
# Weight-of-Evidence encoding of the reduced training frame, fitted against
# the label column.
# NOTE(review): fill_value presumably substitutes for zero counts so the WoE
# ratio/log stays defined — confirm against the feature_engine documentation.
woe = WoEEncoder(fill_value=0.0001)
woe.fit(X=reduced_app_train, y=reduced_app_train[target])

# `data` is the encoded copy; `reduced_app_train` keeps its original values.
data = woe.transform(reduced_app_train)

## **Merge EXT Source Data Back**

In [32]:
# Re-attach the EXT_SOURCE_* columns. `SK_ID_CURR` is the only column the two
# frames share (the EXT_SOURCE_* columns were dropped from
# `reduced_app_train`), so it is named as the join key explicitly.
# NOTE(review): this joins the un-encoded `reduced_app_train`, not the
# WoE-encoded `data`, and rebinds `app_train` — confirm that is intended.
app_train = pd.merge(ext_source, reduced_app_train, how='inner', on='SK_ID_CURR')

In [33]:
columns_to_impute = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']


def impute_column(data, column, exclude=(), random_state=None):
    """Fill the missing values of ``column`` with XGBoost predictions.

    A regressor is trained on the rows where ``column`` is present, using
    every other column of ``data`` (minus ``exclude``) as a feature, and its
    predictions are written into the rows where ``column`` is missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``. It is modified in place.
    column : str
        Name of the column to impute.
    exclude : iterable of str, optional
        Columns that must not be used as predictors (e.g. the label or a row
        id). Defaults to none, which keeps the original behaviour.
    random_state : int or None, optional
        Seed forwarded to ``XGBRegressor``. ``None`` is the library default.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.

    Raises
    ------
    ValueError
        If ``column`` has no observed value to learn from.
    """
    missing_mask = data[column].isna()

    # Nothing to fill: return early instead of predicting on an empty frame.
    if not missing_mask.any():
        return data
    if missing_mask.all():
        raise ValueError(
            f"Cannot impute '{column}': it has no observed values to train on."
        )

    skipped = set(exclude) | {column}
    feature_columns = [name for name in data.columns if name not in skipped]

    X_train = data.loc[~missing_mask, feature_columns]
    y_train = data.loc[~missing_mask, column]
    X_to_predict = data.loc[missing_mask, feature_columns]

    xgb_reg = XGBRegressor(random_state=random_state)
    xgb_reg.fit(X_train, y_train)

    data.loc[missing_mask, column] = xgb_reg.predict(X_to_predict)

    return data


# The original statement replaced +/-inf in `reduced_app_train` in place; that
# effect is kept. It also bound `data` to the return value of an in-place
# `replace`, which is None, so the imputation below could never run.
reduced_app_train = reduced_app_train.replace([np.inf, -np.inf], np.nan)

# Build the frame to impute: the WoE-encoded features plus the raw
# EXT_SOURCE_* columns, joined on the row id. Any EXT_SOURCE_* columns already
# present in `data` (e.g. when this cell is re-run) are dropped first so the
# join does not create suffixed duplicates.
data = (
    ext_source
    .merge(
        data.drop(columns=columns_to_impute, errors='ignore'),
        how='inner',
        on='SK_ID_CURR',
    )
    .replace([np.inf, -np.inf], np.nan)
)

# The label and the row id are kept out of the predictors: the label is not
# available at prediction time, and using it here would leak it into the
# imputed features.
for col in columns_to_impute:
    data = impute_column(
        data,
        col,
        exclude=[target, 'SK_ID_CURR'],
        random_state=random_state,
    )

### **Correlation between each Feature and the Target Feature**

In [35]:
# Correlation matrix of the numeric columns of `data`, then the target's
# column of that matrix ranked from most positive to most negative.
corr_matrix = data.corr(numeric_only=True)
corr_matrix.loc[:, target].sort_values(ascending=False)

TARGET                          1.000000
OCCUPATION_TYPE                 0.079664
DAYS_BIRTH                      0.078239
ORGANIZATION_TYPE               0.071869
NAME_INCOME_TYPE                0.063318
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
NAME_EDUCATION_TYPE             0.057441
DAYS_LAST_PHONE_CHANGE          0.055219
CODE_GENDER                     0.054633
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
WALLSMATERIAL_MODE              0.044065
EMERGENCYSTATE_MODE             0.042213
DAYS_REGISTRATION               0.041975
HOUSETYPE_MODE                  0.040699
NAME_FAMILY_STATUS              0.040291
OWN_CAR_AGE                     0.037612
NAME_HOUSING_TYPE               0.036947
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032248
DEF_60_CNT_SOCIA