# Data Understanding

In [1]:
# import all necessary references
import pandas as pd
import importlib.metadata
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import yaml

print(

    f"""
Pandas version: {pd.__version__}
Matplotlib Pyplot version: {importlib.metadata.version("matplotlib")}
Seaborn version: {sns.__version__}
PyYAML version: {yaml.__version__}    
    """
)


Pandas version: 2.0.0
Matplotlib Pyplot version: 3.7.1
Seaborn version: 0.12.2
PyYAML version: 6.0    
    


In [2]:
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_colwidth", None)
%matplotlib widget

## 1. Preparar o acesso aos dados

Local de armazenagem de dados brutos: `data/01_raw`

In [3]:
# Criar as referências para os arquivos
from pathlib import PurePath

raw_application_test_file_path = "../../data/01_raw/application_test.csv"
raw_application_train_file_path = "../../data/01_raw/application_train.csv"
raw_bureau_file_path = "../../data/01_raw/bureau.csv"
raw_bureau_balance_file_path = "../../data/01_raw/bureau_balance.csv"
raw_bureau_credit_card_balance_file_path = "../../data/01_raw/credit_card_balance.csv"
raw_installments_payments_file_path = "../../data/01_raw/credit_card_balance.csv"
raw_pos_cash_balance_file_path = "../../data/01_raw/POS_CASH_balance.csv"
raw_previous_application_file_path = "../../data/01_raw/previous_application.csv"
raw_sample_submission_file_path = "../../data/01_raw/sample_submission.csv"
home_credit_columns_description_file_path = "../../data/01_raw/HomeCredit_columns_description.csv"


raw_application_test_file = PurePath(raw_application_test_file_path)
raw_application_train_file = PurePath(raw_application_train_file_path)
raw_bureau_file = PurePath(raw_bureau_file_path)
raw_bureau_balance_file = PurePath(raw_bureau_balance_file_path)
raw_bureau_credit_card_balance_file = PurePath(raw_bureau_credit_card_balance_file_path)
raw_installments_payments_file = PurePath(raw_installments_payments_file_path)
raw_pos_cash_balance_file = PurePath(raw_pos_cash_balance_file_path)
raw_previous_application_file = PurePath(raw_previous_application_file_path)
raw_sample_submission_file = PurePath(raw_sample_submission_file_path)
home_credit_columns_description_file = PurePath(home_credit_columns_description_file_path)

In [4]:
home_credit_descriptors_df = pd.read_csv(
    filepath_or_buffer=home_credit_columns_description_file,
    engine="pyarrow",
    # dtype_backend="pyarrow",
    encoding='unicode_escape'
)
raw_application_train_df = pd.read_csv(
    filepath_or_buffer=raw_application_train_file,
    index_col="SK_ID_CURR",
    parse_dates=True,
    engine="pyarrow",
    # dtype_backend="pyarrow",
    encoding='unicode_escape'
)
raw_application_test_df = pd.read_csv(
    filepath_or_buffer=raw_application_test_file,
    index_col="SK_ID_CURR",
    parse_dates=True,
    engine="pyarrow",
    # dtype_backend="pyarrow",
    encoding='unicode_escape'
)

## 2. Entender tipos e significados dos dados

In [5]:
home_credit_descriptors_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0                219 non-null    int64 
 1   Table        219 non-null    object
 2   Row          219 non-null    object
 3   Description  219 non-null    object
 4   Special      219 non-null    object
dtypes: int64(1), object(4)
memory usage: 8.7+ KB


In [6]:
# segregar os descritores apenas da tabela de trabalho atual
application_descriptors = home_credit_descriptors_df[home_credit_descriptors_df["Table"]=="application_{train|test}.csv"]
# application_descriptors.style.set_properties(
#     **{
#     'overflow-wrap': 'break-word',
#     },
#     subset='Description'
# )
application_descriptors.head(125)

Unnamed: 0,Unnamed: 1,Table,Row,Description,Special
0,1,application_{train|test}.csv,SK_ID_CURR,ID of loan in our sample,
1,2,application_{train|test}.csv,TARGET,"Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)",
2,5,application_{train|test}.csv,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
3,6,application_{train|test}.csv,CODE_GENDER,Gender of the client,
4,7,application_{train|test}.csv,FLAG_OWN_CAR,Flag if the client owns a car,
5,8,application_{train|test}.csv,FLAG_OWN_REALTY,Flag if client owns a house or flat,
6,9,application_{train|test}.csv,CNT_CHILDREN,Number of children the client has,
7,10,application_{train|test}.csv,AMT_INCOME_TOTAL,Income of the client,
8,11,application_{train|test}.csv,AMT_CREDIT,Credit amount of the loan,
9,12,application_{train|test}.csv,AMT_ANNUITY,Loan annuity,


In [7]:
raw_application_train_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 307511 entries, 100002 to 456255
Data columns (total 121 columns):
 #    Column                        Non-Null Count   Dtype  
---   ------                        --------------   -----  
 0    TARGET                        307511 non-null  int64  
 1    NAME_CONTRACT_TYPE            307511 non-null  object 
 2    CODE_GENDER                   307511 non-null  object 
 3    FLAG_OWN_CAR                  307511 non-null  object 
 4    FLAG_OWN_REALTY               307511 non-null  object 
 5    CNT_CHILDREN                  307511 non-null  int64  
 6    AMT_INCOME_TOTAL              307511 non-null  float64
 7    AMT_CREDIT                    307511 non-null  float64
 8    AMT_ANNUITY                   307499 non-null  float64
 9    AMT_GOODS_PRICE               307233 non-null  float64
 10   NAME_TYPE_SUITE               307511 non-null  object 
 11   NAME_INCOME_TYPE              307511 non-null  object 
 12   NAME_EDUCATION_TYPE         

In [8]:
raw_application_train_df.head(30)

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100008,0,Cash loans,M,N,Y,0,99000.0,490495.5,27517.5,454500.0,"Spouse, partner",State servant,Secondary / secondary special,Married,House / apartment,0.035792,-16941,-1588,-4970.0,-477,,1,1,1,1,1,0,Laborers,2.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.354225,0.621226,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2536.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,1.0
100009,0,Cash loans,F,Y,Y,1,171000.0,1560726.0,41301.0,1395000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.035792,-13778,-3130,-1213.0,-619,17.0,1,1,0,1,1,0,Accountants,3.0,2,2,SUNDAY,16,0,0,0,0,0,0,Business Entity Type 3,0.774761,0.724,0.49206,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-1562.0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,0.0,1.0,1.0,2.0
100010,0,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.003122,-18850,-449,-4597.0,-2379,8.0,1,1,1,1,0,0,Managers,2.0,3,3,MONDAY,16,0,0,0,0,1,1,Other,,0.714279,0.540654,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1070.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100011,0,Cash loans,F,N,Y,0,112500.0,1019610.0,33826.5,913500.0,Children,Pensioner,Secondary / secondary special,Married,House / apartment,0.018634,-20099,365243,-7427.0,-3514,,1,0,0,1,0,0,,2.0,2,2,WEDNESDAY,14,0,0,0,0,0,0,XNA,0.587334,0.205747,0.751724,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
100012,0,Revolving loans,M,N,Y,0,135000.0,405000.0,20250.0,405000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.019689,-14469,-2019,-14437.0,-3992,,1,1,0,1,0,0,Laborers,1.0,2,2,THURSDAY,8,0,0,0,0,0,0,Electricity,,0.746644,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-1673.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


### 2.1. Classificar os tipos de dados

In [9]:
unique_id_variables = ["SK_ID_CURR"]

target_variables = ["TARGET"]

cat_variables = [
    col
    for col in raw_application_train_df.columns.to_list()
    if any(substring in col for substring in ["FLAG", "NAME", "RATING", "CODE"])
] + [
    "WEEKDAY_APPR_PROCESS_START",
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "NAME_TYPE_SUITE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
]
cat_variables = list(
    set(cat_variables).difference(
        set(
            col
            for col in raw_application_train_df.columns.to_list()
            if any(substring in col for substring in ["FLAG_DOCUMENT"])
        )
    )
)

int_variables = [
    col
    for col in raw_application_train_df.columns.to_list()
    if any(substring in col for substring in ["AMT_REQ_CREDIT"])
]

float_variables = [
    col
    for col in raw_application_train_df.columns.to_list()
    if any(substring in col for substring in ["AMT_", "EXT_", "_AVG", "_MODE", "_MEDI"])
] + [
    "REGION_POPULATION_RELATIVE",
]
float_variables = list(
    set(float_variables).difference(set(cat_variables)).difference(set(int_variables))
)

time_interval_variables = [
    col
    for col in raw_application_train_df.columns.to_list()
    if any(
        substring in col for substring in ["YEARS", "MONTHS", "DAYS", "HOURS", "MINS"]
    )
]
time_interval_variables = list(
    set(time_interval_variables).difference(set(float_variables))
)

datetime_variables = [
    col
    for col in raw_application_train_df.columns.to_list()
    if any(substring in col for substring in ["YEAR", "MONTH", "DAY", "HOUR"])
]
datetime_variables = list(
    set(datetime_variables)
    .difference(set(cat_variables))
    .difference(set(int_variables))
    .difference(set(float_variables))
    .difference(set(time_interval_variables))
)


text_variables = []

print(
    f"""
Unique ID Variables: {unique_id_variables}

Target Variables: {target_variables}

Categorial Variables: {cat_variables}

Integer Variables: {int_variables}

Float Variables: {float_variables}

Time Interval Variables: {time_interval_variables}

Datetime Variables: {datetime_variables}

Text Variables: {text_variables}
"""
)


Unique ID Variables: ['SK_ID_CURR']

Target Variables: ['TARGET']

Categorial Variables: ['FONDKAPREMONT_MODE', 'FLAG_EMAIL', 'WEEKDAY_APPR_PROCESS_START', 'CODE_GENDER', 'EMERGENCYSTATE_MODE', 'FLAG_CONT_MOBILE', 'NAME_FAMILY_STATUS', 'REGION_RATING_CLIENT_W_CITY', 'HOUSETYPE_MODE', 'FLAG_WORK_PHONE', 'NAME_INCOME_TYPE', 'FLAG_OWN_CAR', 'NAME_TYPE_SUITE', 'FLAG_MOBIL', 'FLAG_PHONE', 'FLAG_OWN_REALTY', 'NAME_CONTRACT_TYPE', 'WALLSMATERIAL_MODE', 'NAME_HOUSING_TYPE', 'REGION_RATING_CLIENT', 'NAME_EDUCATION_TYPE', 'FLAG_EMP_PHONE']

Integer Variables: ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']

Float Variables: ['COMMONAREA_AVG', 'BASEMENTAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'EXT_SOURCE_3', 'ELEVATORS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'COMMONAREA_MODE', 'FLOORSMAX_MEDI', 'BASEMENTAREA_AVG', 'AMT_INCOME_TOTAL', 'LANDAREA_AVG', 'EXT_SOURCE_1', 'REGI

In [10]:
cat_ordinal_vars = [
    "REGION_RATING_CLIENT",
    "REGION_RATING_CLIENT_W_CITY"
]

cat_nominal_vars = list(
    set(cat_variables)
    .difference(set(cat_ordinal_vars))
)

num_continuous_vars = float_variables.copy()

num_discrete_vars = int_variables + time_interval_variables



print(
    f"""
Categorial Ordinal Variables: {cat_ordinal_vars}

Categorial Nominal Variables: {cat_nominal_vars}

Numeric Continuous Variables: {num_continuous_vars}

Numeric Discrete Variables: {num_discrete_vars}

Datetime Variables: {datetime_variables}
"""
)


Categorial Ordinal Variables: ['REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY']

Categorial Nominal Variables: ['FONDKAPREMONT_MODE', 'FLAG_EMAIL', 'WEEKDAY_APPR_PROCESS_START', 'CODE_GENDER', 'NAME_TYPE_SUITE', 'FLAG_MOBIL', 'NAME_EDUCATION_TYPE', 'FLAG_PHONE', 'EMERGENCYSTATE_MODE', 'FLAG_OWN_REALTY', 'NAME_CONTRACT_TYPE', 'FLAG_CONT_MOBILE', 'WALLSMATERIAL_MODE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'HOUSETYPE_MODE', 'FLAG_WORK_PHONE', 'NAME_INCOME_TYPE', 'FLAG_EMP_PHONE', 'FLAG_OWN_CAR']

Numeric Continuous Variables: ['COMMONAREA_AVG', 'BASEMENTAREA_MODE', 'NONLIVINGAPARTMENTS_MODE', 'EXT_SOURCE_3', 'ELEVATORS_AVG', 'YEARS_BEGINEXPLUATATION_MODE', 'COMMONAREA_MODE', 'FLOORSMAX_MEDI', 'BASEMENTAREA_AVG', 'AMT_INCOME_TOTAL', 'LANDAREA_AVG', 'EXT_SOURCE_1', 'REGION_POPULATION_RELATIVE', 'BASEMENTAREA_MEDI', 'APARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'YEARS_BUILD_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_MEDI', 'LIVINGAPARTMENTS_MODE', 'FLOO

In [11]:
project_parameters_file = PurePath("../../conf/base/parameters.yml")

with open(project_parameters_file, "r") as file:
    project_parameters = yaml.safe_load(file)

project_parameters


{'dataset_variables_types': {'application_test': {'categorical_nominal': ['EMERGENCYSTATE_MODE',
    'FLAG_OWN_REALTY',
    'NAME_EDUCATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'FLAG_MOBIL',
    'FLAG_EMP_PHONE',
    'FONDKAPREMONT_MODE',
    'FLAG_OWN_CAR',
    'WALLSMATERIAL_MODE',
    'NAME_TYPE_SUITE',
    'FLAG_PHONE',
    'FLAG_CONT_MOBILE',
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'HOUSETYPE_MODE',
    'NAME_INCOME_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'FLAG_WORK_PHONE',
    'FLAG_EMAIL'],
   'categorical_ordinal': ['REGION_RATING_CLIENT',
    'REGION_RATING_CLIENT_W_CITY'],
   'datetime': ['HOUR_APPR_PROCESS_START'],
   'numerical_continuous': ['COMMONAREA_AVG',
    'LANDAREA_MODE',
    'YEARS_BEGINEXPLUATATION_MEDI',
    'TOTALAREA_MODE',
    'LANDAREA_AVG',
    'BASEMENTAREA_MEDI',
    'NONLIVINGAREA_MEDI',
    'LANDAREA_MEDI',
    'FLOORSMAX_AVG',
    'ENTRANCES_MODE',
    'LIVINGAPARTMENTS_MODE',
    'YEARS_BEGINEXPLUATATION_AVG',
   

In [12]:
project_parameters["dataset_variables_types"] = {
    "application_train": {
        "categorical_ordinal": cat_ordinal_vars,
        "categorical_nominal": cat_nominal_vars,
        "numerical_continuous": num_continuous_vars,
        "numerical_discrete": num_discrete_vars,
        "datetime": datetime_variables
    },
    "application_test": {
        "categorical_ordinal": cat_ordinal_vars,
        "categorical_nominal": cat_nominal_vars,
        "numerical_continuous": num_continuous_vars,
        "numerical_discrete": num_discrete_vars,
        "datetime": datetime_variables
    }
}

project_parameters["target"] = target_variables


In [13]:
with open(project_parameters_file, "w") as file:
    yaml.safe_dump(project_parameters, file)