# **Libraries**

In [2]:
import importlib
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

import functions

importlib.reload(functions)

# **Display**

In [2]:
%matplotlib inline

# Pandas display limits. `display.max_rows` is set once: the earlier value of
# 300000 was overridden to 200 later in the same cell and never took effect.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_colwidth", 500)

# Blanket filter: every warning category is silenced for the whole session,
# which already covers FutureWarning (no separate filter is needed).
# NOTE(review): this also hides pandas deprecation warnings -- consider narrowing it.
warnings.filterwarnings("ignore")

# NOTE(review): not referenced in the visible part of the notebook; presumably a
# size used by later plotting cells -- confirm.
size = 20

# **Variables**

In [3]:
# Name of the label column; used to index the correlation matrix further down.
target = 'TARGET'

# Fixed seed value kept in one place.
# NOTE(review): not consumed in the visible cells -- presumably passed to later
# sampling/model code; confirm.
random_state = 101

# **Data**

## **Load Data**

In [3]:
# Folder holding the raw CSV files. The default is the original local path, so
# existing runs are unchanged; set the RISK_DATA_DIR environment variable to
# point the notebook at another location without editing this cell.
DATA_DIR = Path(os.environ.get("RISK_DATA_DIR", r"C:\Users\Dell\Documents\AI\Risk\Data"))

# index_col=False tells pandas not to use the first CSV column as the index.
application_train = pd.read_csv(DATA_DIR / "application_train.csv", index_col=False)
application_test = pd.read_csv(DATA_DIR / "application_test.csv", index_col=False)

In [4]:
# Preview the first rows of the test frame to sanity-check the load.
application_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,...,0,0,0,0,,,,,,


# **Data Cleaning**

## **Removing Constant Features**

In [5]:
# Columns of the test frame that hold a single unique value, as reported by the
# project helper in the local `functions` module.
# NOTE(review): the list is derived from application_test only and is later
# dropped from both frames; confirm the test frame is the intended reference.
list_columns = functions.check_columns_with_one_uniquevalue(application_test)

# Show the column names followed by a one-line count summary.
print(
    list_columns,
    f"There are {len(list_columns)} columns with just 1 unique value",
    sep="\n",
)

['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
There are 11 columns with just 1 unique value


In [6]:
# Remove the single-valued columns from both frames so they keep matching schemas.
# Note: each name is rebound to the reduced frame, so re-running this cell alone
# raises KeyError because the columns are already gone.
application_train = application_train.drop(columns=list_columns)
application_test = application_test.drop(columns=list_columns)

## **Reduce Memory Usage**

In [7]:
# Shrink the training frame with the project helper, which prints memory usage
# before and after. NOTE(review): presumably done by downcasting dtypes --
# confirm in the `functions` module.
application_train = functions.reduce_memory_usage(application_train)

Memory usage of dataframe is 260.42 MB
Memory usage after optimization is: 89.15 MB
Decreased by 65.8%


In [8]:
# Same memory reduction for the test frame; the helper prints usage before and after.
application_test = functions.reduce_memory_usage(application_test)

Memory usage of dataframe is 40.91 MB
Memory usage after optimization is: 14.09 MB
Decreased by 65.6%


## **External Source Features**

In [None]:
# NOTE(review): placeholder example on a toy frame -- it does not touch
# application_train/application_test yet. It shows the pattern for building a
# new DataFrame from a list of selected columns. This cell previously appeared
# twice verbatim and re-imported pandas; it is kept once and relies on the
# import at the top of the notebook.

# Toy frame, named `sample_df` so it cannot be confused with the `data` alias
# that later cells bind to application_train.
sample_df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8],
    'C': [9, 10, 11, 12],
    'D': [13, 14, 15, 16],
    'E': [17, 18, 19, 20]
})

# Columns to keep, in the order they should appear.
selected_columns = ['A', 'B', 'C', 'D']

# Indexing with a list of labels returns a new DataFrame with only those columns.
new_df = sample_df[selected_columns]

# Bare last expression so the notebook renders the frame with its rich display.
new_df


# **Missing Values**

In [9]:
# Per-column missing-value report (count and percentage) for the training
# frame, produced by the project helper in the local `functions` module.
functions.MissingValues(application_train)

Unnamed: 0,NumberMissing,PercentageMissing
COMMONAREA_MEDI,214865,69.87
COMMONAREA_MODE,214865,69.87
COMMONAREA_AVG,214865,69.87
NONLIVINGAPARTMENTS_MODE,213514,69.43
NONLIVINGAPARTMENTS_MEDI,213514,69.43
NONLIVINGAPARTMENTS_AVG,213514,69.43
FONDKAPREMONT_MODE,210295,68.39
LIVINGAPARTMENTS_AVG,210199,68.35
LIVINGAPARTMENTS_MEDI,210199,68.35
LIVINGAPARTMENTS_MODE,210199,68.35


In [10]:
# Candidate fill value for numeric gaps; the following cell reuses `value`.
value = 0

# Element-wise equality, reduced per column and then across columns, tells us
# whether the candidate already occurs anywhere in the training frame.
# NOTE(review): if it does occur, filled gaps become indistinguishable from
# real values -- confirm that is acceptable.
if not application_train.eq(value).any().any():
    print(f"The value {value} does not exist in the DataFrame.")
else:
    print(f"The value {value} exists in the DataFrame.")

The value 0 exists in the DataFrame.


In [11]:
# Alias, not a copy: `data` and `application_train` are the same object, so the
# assignment below updates the training frame as well.
data = application_train
num_value = value

# Fill gaps in every float column and assign the result back to the frame.
# The previous form, `data[col].fillna(num_value, inplace=True)`, is a chained
# in-place call on a temporary Series: under pandas Copy-on-Write it leaves the
# frame unchanged, and the FutureWarning announcing this is hidden by the
# notebook's global warnings filter.
float_cols = data.select_dtypes(include=['float16', 'float32', 'float64']).columns
data[float_cols] = data[float_cols].fillna(num_value)

In [12]:
# Candidate placeholder for missing categorical entries; the following cell
# reuses `value`.
value = 'UNKNOWN'

# Report whether the placeholder already occurs anywhere in the training frame.
print(
    f"The value {value} exists in the DataFrame."
    if application_train.eq(value).any().any()
    else f"The value {value} does not exist in the DataFrame."
)

The value UNKNOWN does not exist in the DataFrame.


In [13]:
cat_value = value

# Fill gaps in every object-dtype column and assign the result back to the
# frame. Calling `fillna(..., inplace=True)` on `data[col]` is a chained
# in-place call on a temporary Series and does not write through to the frame
# under pandas Copy-on-Write, so explicit assignment is used instead.
object_cols = data.select_dtypes(include=['object']).columns
data[object_cols] = data[object_cols].fillna(cat_value)

In [14]:
# Re-run the missing-value report after filling; an empty table means no
# column has gaps left.
functions.MissingValues(data)

Unnamed: 0,NumberMissing,PercentageMissing


# **Correlation**

In [15]:
# Pairwise correlation between the numeric columns only (non-numeric columns
# are excluded by numeric_only=True).
corr_matrix = application_train.corr(numeric_only=True)

# Take the target's column of the matrix and list features from the highest
# correlation with the target down to the lowest.
corr_matrix.loc[:, target].sort_values(ascending=False)

TARGET                          1.000000
DAYS_BIRTH                      0.078239
REGION_RATING_CLIENT_W_CITY     0.060893
REGION_RATING_CLIENT            0.058899
DAYS_LAST_PHONE_CHANGE          0.055218
DAYS_ID_PUBLISH                 0.051457
REG_CITY_NOT_WORK_CITY          0.050994
FLAG_EMP_PHONE                  0.045982
REG_CITY_NOT_LIVE_CITY          0.044395
FLAG_DOCUMENT_3                 0.044346
DAYS_REGISTRATION               0.041975
LIVE_CITY_NOT_WORK_CITY         0.032518
DEF_30_CNT_SOCIAL_CIRCLE        0.032394
DEF_60_CNT_SOCIAL_CIRCLE        0.031401
FLAG_WORK_PHONE                 0.028524
CNT_CHILDREN                    0.019187
OBS_30_CNT_SOCIAL_CIRCLE        0.009447
OBS_60_CNT_SOCIAL_CIRCLE        0.009337
CNT_FAM_MEMBERS                 0.009312
REG_REGION_NOT_WORK_REGION      0.006942
REG_REGION_NOT_LIVE_REGION      0.005576
AMT_REQ_CREDIT_BUREAU_YEAR      0.005522
LIVE_REGION_NOT_WORK_REGION     0.002819
OWN_CAR_AGE                     0.002285
AMT_REQ_CREDIT_B