# 1. Config

In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
pd.set_option("display.max_columns", None)
import gc
gc.enable()

# 2. Essential Functions

In [3]:
def check_missing_value_col(df, n):
    null_percent_col = (df.isnull().sum() / df.shape[0]) * 100
    df_null_percent_col = pd.DataFrame(null_percent_col).reset_index().rename(columns={0: 'missing_percent', 'index': 'feature'})
    df_null_percent_col = df_null_percent_col.sort_values('missing_percent', ascending=False)
    return df_null_percent_col[df_null_percent_col['missing_percent'] > n]
def statitic_missing_value_col(df):
    for m in [0,5,10,30,40,50,100]:
        print(f"The number of colums have the null percentage > {m}",check_missing_value_col(df, m).shape[0])

In [4]:
def check_missing_value_row(df, n):
    null_percent_row = (df.isnull().sum(axis=1) / df.shape[1]) * 100
    df_null_percent_row = pd.DataFrame(null_percent_row, columns=['missing_percent']).reset_index().rename(columns={'index': 'application'})
    return df_null_percent_row[df_null_percent_row['missing_percent'] > n]

def statitic_missing_value_row(df):
    for m in [0,5, 10, 15, 50, 100]:
        print(f"The number of rows with the null percentage > {m}: ", check_missing_value_row(df, m).shape[0])


In [5]:
def check_imbalance(data, column, threshold=0.8):
    value_counts = data[column].value_counts(normalize=True)  # Use normalize=True to get percentages
    max_percentage = value_counts.max()  # Maximum percentage
    most_frequent_value = value_counts.idxmax()  # Value with the maximum frequency
    
    if max_percentage > threshold:
        print(f"Column '{column}' is imbalanced. The most frequent value '{most_frequent_value}' appears in {max_percentage*100:.2f}% of the data.")
        return True
    else:
        return None

In [6]:
def convert_days(data, features, t = 12, rounding = True, replace = False):
    for var in features:
        if replace == True:
            if rounding == True:
                data[var] = abs(data[var])//t
            else:
                data[var] = abs(data[var])/t
        else:
            if rounding == True:
                data["CONVERTED_" + str(var)] = abs(data[var])//t
            else:
                data["CONVERTED_" + str(var)] = abs(data[var])/t
    return data

In [7]:
def _tbl_dtype(data):
    sum_dtype = pd.DataFrame(data.dtypes).sort_values(0).rename(columns = {0:'Data Type'})
    return sum_dtype

In [8]:
def identify_columns_with_outliers(data, method='iqr'):
    columns_with_outliers = []

    for column in data.select_dtypes(include='number').columns:
        if method == 'iqr':
            # Interquartile Range (IQR) method
            Q1 = data[column].quantile(0.25)
            Q3 = data[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            if ((data[column] < lower_bound) | (data[column] > upper_bound)).any():
                columns_with_outliers.append(column)
        elif method == 'std':
            # Standard Deviation method
            mean = data[column].mean()
            std_dev = data[column].std()
            lower_bound = mean - 3 * std_dev
            upper_bound = mean + 3 * std_dev
            if ((data[column] < lower_bound) | (data[column] > upper_bound)).any():
                columns_with_outliers.append(column)
    
    return columns_with_outliers

def plot_boxplots_for_outlier_columns(data, method='iqr'):
    columns_with_outliers = identify_columns_with_outliers(data, method)

    if not columns_with_outliers:
        print("No columns with outliers found.")
        return

    num_columns = len(columns_with_outliers)
    num_rows = (num_columns + 2) // 3  # Arrange in a grid with 3 columns per row

    fig, axes = plt.subplots(num_rows, 3, figsize=(15, 5 * num_rows))
    axes = axes.flatten()

    for idx, column in enumerate(columns_with_outliers):
        sns.boxplot(x=data[column], ax=axes[idx])
        axes[idx].set_title(f'Box Plot for {column}')
        axes[idx].set_xlabel('Value')

    for idx in range(num_columns, len(axes)):
        axes[idx].axis('off')

    plt.tight_layout()
    plt.show()


In [9]:
def encode_column(data, columns, encoding_type='label'):
    for column in columns:
        if encoding_type == 'label':
            # Label Encoding
            le = LabelEncoder()
            data.loc[:, column] = le.fit_transform(data[column])
        elif encoding_type == 'onehot':
            # One-Hot Encoding
            data = pd.get_dummies(data, columns=[column], drop_first=False)
        else:
            print(f"Invalid encoding type: {encoding_type}. Choose 'label' or 'onehot'.")
            return data

    return data

In [10]:
def create_logs(data, columns, replace=False):
    for var in columns:
        if replace:
            data.loc[:, var] = np.log(data[var].abs() + 1)
        else:
            data[var + "_LOG"] = np.log(data[var].abs() + 1)
    return data

# 3. Data Import

In [11]:
train = pd.read_csv('dseb63_application_train.csv')
train.shape

(246009, 123)

In [12]:
test = pd.read_csv('dseb63_application_test.csv')
test.shape

(61502, 122)

In [13]:
# extract target
y = train[["SK_ID_CURR", "TARGET"]]
del train["TARGET"]

In [14]:
# concatenate application data
appl = pd.concat([train, test])
# del train, test

# 4. PREPROCESSING

## Handling missing data

In [15]:
statitic_missing_value_col(appl)
statitic_missing_value_row(appl)

The number of colums have the null percentage > 0 67
The number of colums have the null percentage > 5 57
The number of colums have the null percentage > 10 57
The number of colums have the null percentage > 30 50
The number of colums have the null percentage > 40 49
The number of colums have the null percentage > 50 41
The number of colums have the null percentage > 100 0
The number of rows with the null percentage > 0:  298909
The number of rows with the null percentage > 5:  228128
The number of rows with the null percentage > 10:  209969
The number of rows with the null percentage > 15:  191472
The number of rows with the null percentage > 50:  0
The number of rows with the null percentage > 100:  0


In [16]:
table = check_missing_value_col(appl, 0)
table

Unnamed: 0,feature,missing_percent
61,COMMONAREA_MODE,69.872297
47,COMMONAREA_AVG,69.872297
75,COMMONAREA_MEDI,69.872297
83,NONLIVINGAPARTMENTS_MEDI,69.432963
55,NONLIVINGAPARTMENTS_AVG,69.432963
...,...,...
41,EXT_SOURCE_2,0.214626
9,AMT_GOODS_PRICE,0.090403
8,AMT_ANNUITY,0.003902
28,CNT_FAM_MEMBERS,0.000650


In [17]:
##### FEATURE REMOVAL
drops = ['APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'COMMONAREA_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 
         'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI', 'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI',
         'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI','YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI',
         'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'COMMONAREA_MODE','ELEVATORS_MODE', 'ENTRANCES_MODE', 
         'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE', 'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 
         'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAREA_MODE', 'TOTALAREA_MODE',  'YEARS_BEGINEXPLUATATION_MODE']
appl = appl.drop(columns = drops)
# appl.drop(columns = drops).shape

Cột trên 50 drop, ưu tiên giữ avg mà mode rác. Dòng kbiet, trừ cái ext_source 1

In [18]:
# appl = appl.dropna(axis=1, thresh=appl.shape[0]*0.55)
appl.shape

(307511, 94)

In [19]:
statitic_missing_value_col(appl)
statitic_missing_value_row(appl)

The number of colums have the null percentage > 0 39
The number of colums have the null percentage > 5 29
The number of colums have the null percentage > 10 29
The number of colums have the null percentage > 30 22
The number of colums have the null percentage > 40 21
The number of colums have the null percentage > 50 18
The number of colums have the null percentage > 100 0
The number of rows with the null percentage > 0:  298909
The number of rows with the null percentage > 5:  224782
The number of rows with the null percentage > 10:  189773
The number of rows with the null percentage > 15:  160768
The number of rows with the null percentage > 50:  0
The number of rows with the null percentage > 100:  0


In [20]:
#Drop col - Unnamed rác
appl = appl.drop(appl.columns[0], axis=1)
appl.shape

(307511, 93)

## Check duplicate

In [21]:
appl[appl.duplicated('SK_ID_CURR')].shape[0]

0

## Check Imbalance(Draft)

In [22]:
#Imbalance Data[Mai hỏi, chịu] #Cái này EDA sẽ tốt hơn
for col in appl.columns:
    check_imbalance(appl, col, threshold=0.95)

Column 'FLAG_MOBIL' is imbalanced. The most frequent value '1' appears in 100.00% of the data.
Column 'FLAG_CONT_MOBILE' is imbalanced. The most frequent value '1' appears in 99.81% of the data.
Column 'REG_REGION_NOT_LIVE_REGION' is imbalanced. The most frequent value '0' appears in 98.49% of the data.
Column 'LIVE_REGION_NOT_WORK_REGION' is imbalanced. The most frequent value '0' appears in 95.93% of the data.
Column 'HOUSETYPE_MODE' is imbalanced. The most frequent value 'block of flats' appears in 98.23% of the data.
Column 'EMERGENCYSTATE_MODE' is imbalanced. The most frequent value 'No' appears in 98.56% of the data.
Column 'FLAG_DOCUMENT_2' is imbalanced. The most frequent value '0' appears in 100.00% of the data.
Column 'FLAG_DOCUMENT_4' is imbalanced. The most frequent value '0' appears in 99.99% of the data.
Column 'FLAG_DOCUMENT_5' is imbalanced. The most frequent value '0' appears in 98.49% of the data.
Column 'FLAG_DOCUMENT_7' is imbalanced. The most frequent value '0' app

Hầu hết đêù là flag, k xóa đc, còn nx cái kia ch biết xử lí sao. Mai tính nhé.......

## Detecting and treating outliers

In [23]:
# appl['AGE_REGISTRATION'] = abs(appl['DAYS_REGISTRATION'] // 365)
# appl['AGE_BIRTH'] = abs(appl['DAYS_BIRTH'] // 365)
# appl['AGE_EMPLOYED'] = abs(appl['DAYS_EMPLOYED'] // 365)

# appl[['AGE_REGISTRATION', 'AGE_BIRTH', 'AGE_EMPLOYED']].describe()

In [24]:
# plot_boxplots_for_outlier_columns(train, method='iqr')

AMT_INCOME_TOTAL standardize quá cao, trông cẩn thận, có thể nên box plot ra để xem outlier. t vẽ đại. Có mấy cái nhìn lạ lạ nma t ch chưa có kn xử lí mai tính. À chắc xóa mấy cái flag néu muốn vẽ

## Feature engineer

In [None]:
#4 instances of 'XNA' in code gender
appl['CODE_GENDER'] = appl['CODE_GENDER'].replace('XNA', np.random.choice(['F', 'M']))
#XNA 2 in NAME_FAMILY_STATUS
appl['NAME_FAMILY_STATUS'] = appl['NAME_FAMILY_STATUS'].replace('Unknown', np.random.choice(['Married', 'Civil marriage', 'Single / not married', 'Widow','Separated']))
#DAYS_EMPLOYED
#Chuyển từ ngày thành năm.[nên scale về day hết, hay month, hay year, v để tách lẻ có sao k.] 
appl = convert_days(appl, ['DAYS_BIRTH','DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH','DAYS_LAST_PHONE_CHANGE'], t=30, rounding=False, replace=False)
appl = appl.drop(columns=['DAYS_BIRTH','DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH','DAYS_LAST_PHONE_CHANGE'], axis=1)
appl['CONVERTED_DAYS_BIRTH'] = np.where(appl['CONVERTED_DAYS_BIRTH'] > 4800, np.nan, appl['CONVERTED_DAYS_BIRTH'])
#XNA 2 in ORGANIZATION+TYPE
##Vl xủ lí sao mấy thg unknown toàn mấy thằng sống trên 1000 năm cười ẻ
appl['ORGANIZATION_TYPE'] = appl['ORGANIZATION_TYPE'].replace('XNA', np.nan)
##Doc_document
doc_vars = [f"FLAG_DOCUMENT_{i}" for i in range(2, 22)]
appl["NUM_DOCUMENTS"] = appl[doc_vars].sum(axis=1)
appl = appl.drop(columns=doc_vars)
# Application date: classify as "Working day" or "Weekend"
appl["DAY_APPR_PROCESS_START"] = "Working day"
appl.loc[
    (appl["WEEKDAY_APPR_PROCESS_START"] == "SATURDAY") | 
    (appl["WEEKDAY_APPR_PROCESS_START"] == "SUNDAY"),
    "DAY_APPR_PROCESS_START"
] = "Weekend"

In [26]:
abs(appl['DAYS_BIRTH']).describe()

count    307511.000000
mean      16036.995067
std        4363.988632
min        7489.000000
25%       12413.000000
50%       15750.000000
75%       19682.000000
max       25229.000000
Name: DAYS_BIRTH, dtype: float64

In [None]:
appl.loc[appl[appl['FLAG_OWN_CAR'] == 'N'].index, 'OWN_CAR_AGE'] = -1 

In [None]:
appl

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,YEARS_BUILD_MODE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_CURR,CONVERTED_DAYS_BIRTH,CONVERTED_DAYS_EMPLOYED,CONVERTED_DAYS_REGISTRATION,CONVERTED_DAYS_ID_PUBLISH,CONVERTED_DAYS_LAST_PHONE_CHANGE,NUM_DOCUMENTS,DAY_APPR_PROCESS_START,OWN_CAR_AGE
0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.7960,0.0605,0.08,0.0345,0.2917,0.3333,0.0130,0.0773,0.0549,0.0039,0.0098,0.8040,reg oper account,block of flats,Block,No,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278621,558.833333,39.600000,39.533333,9.700000,27.600000,1,Working day,-1.0
1,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,,,,,,,139008,633.500000,101.300000,327.766667,81.233333,20.566667,1,Working day,-1.0
2,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138348,664.400000,101.266667,143.700000,115.266667,36.866667,1,Working day,-1.0
3,Cash loans,M,N,Y,0,99000.0,490495.5,27517.5,454500.0,"Spouse, partner",State servant,Secondary / secondary special,Married,House / apartment,0.035792,1,1,1,1,1,0,Laborers,2.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,Other,,0.354225,0.621226,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,64140,564.700000,52.933333,165.666667,15.900000,84.533333,1,Working day,-1.0
4,Cash loans,M,Y,Y,0,360000.0,1530000.0,42075.0,1530000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.003122,1,1,1,1,0,0,Managers,2.0,3,3,MONDAY,16,0,0,0,0,1,1,Other,,0.714279,0.540654,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,219374,628.333333,14.966667,153.233333,79.300000,35.666667,1,Working day,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61497,Cash loans,M,Y,Y,0,171000.0,521280.0,23089.5,450000.0,Family,Pensioner,Higher education,Married,House / apartment,0.018634,1,0,0,1,0,0,,2.0,2,2,SATURDAY,10,0,0,0,0,0,0,,0.571886,0.560189,0.410103,,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,150442,793.100000,12174.766667,125.933333,182.833333,53.766667,1,Weekend,-1.0
61498,Revolving loans,M,Y,Y,2,450000.0,900000.0,45000.0,900000.0,Family,Working,Secondary / secondary special,Married,Municipal apartment,0.003541,1,1,0,1,0,0,Laborers,4.0,1,1,SUNDAY,13,0,0,0,0,0,0,Trade: type 7,0.581535,0.401592,0.307737,0.0165,0.0000,0.9866,0.8164,0.0053,0.00,0.0690,0.0417,0.0833,0.0101,0.0132,0.0096,0.0010,0.0296,0.8301,reg oper account,block of flats,Wooden,Yes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5217,620.733333,69.900000,177.833333,71.766667,54.600000,0,Weekend,
61499,Cash loans,F,N,Y,0,225000.0,202500.0,24030.0,202500.0,Family,Commercial associate,Secondary / secondary special,Married,House / apartment,0.028663,1,1,0,1,0,0,Core staff,2.0,2,2,MONDAY,13,0,0,0,0,0,0,Bank,0.390905,0.495743,0.138513,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,3.0,260741,410.833333,6.233333,39.866667,150.600000,62.066667,1,Working day,-1.0
61500,Cash loans,M,N,Y,0,121500.0,254700.0,30357.0,225000.0,Family,Pensioner,Secondary / secondary special,Married,House / apartment,0.003818,1,0,0,1,0,0,,2.0,2,2,SATURDAY,4,0,0,0,0,0,0,,,0.591947,0.742182,0.0866,0.0833,0.9856,,,,0.2069,0.1667,,,,0.0510,,,,,block of flats,Block,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,284794,787.633333,12174.766667,79.200000,142.000000,52.900000,1,Weekend,-1.0


In [None]:
# income ratios
appl["CREDIT_BY_INCOME"]      = appl["AMT_CREDIT"]      / appl["AMT_INCOME_TOTAL"]
appl["ANNUITY_BY_INCOME"]     = appl["AMT_ANNUITY"]     / appl["AMT_INCOME_TOTAL"]
appl["GOODS_PRICE_BY_INCOME"] = appl["AMT_GOODS_PRICE"] / appl["AMT_INCOME_TOTAL"]
appl["INCOME_PER_PERSON"]     = appl["AMT_INCOME_TOTAL"] / appl["CNT_FAM_MEMBERS"]


In [None]:
# number of adults
appl["CNT_ADULTS"] = appl["CNT_FAM_MEMBERS"] - appl["CNT_CHILDREN"]
appl['CHILDREN_RATIO'] = appl['CNT_CHILDREN'] / appl['CNT_FAM_MEMBERS']
# number of overall payments
appl['ANNUITY LENGTH'] = appl['AMT_CREDIT'] / appl['AMT_ANNUITY']
# external sources
appl["EXT_SOURCE_MEAN"] = appl[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis = 1)
appl["NUM_EXT_SOURCES"] = 3 - (appl["EXT_SOURCE_1"].isnull().astype(int) + appl["EXT_SOURCE_2"].isnull().astype(int) + appl["EXT_SOURCE_3"].isnull().astype(int))


In [None]:
def ratio_fix(target_column, original_column, df):
    ratio_column = df[target_column] / df[original_column]
    df.loc[ratio_column >= 1, target_column] = np.nan
    average_ratio = ratio_column[ratio_column < 1].mean()
    df.loc[df[target_column].isnull(), target_column] = df.loc[df[target_column].isnull(), original_column] * average_ratio
    return df

for i in ["CONVERTED_DAYS_EMPLOYED", "CONVERTED_DAYS_LAST_PHONE_CHANGE", "CONVERTED_DAYS_REGISTRATION", "CONVERTED_DAYS_ID_PUBLISH", "OWN_CAR_AGE"]:
    appl = ratio_fix(i, 'CONVERTED_DAYS_BIRTH', appl)


In [None]:
appl[["CONVERTED_DAYS_EMPLOYED", "CONVERTED_DAYS_LAST_PHONE_CHANGE", "CONVERTED_DAYS_REGISTRATION", "CONVERTED_DAYS_ID_PUBLISH", "OWN_CAR_AGE"]]

Unnamed: 0,CONVERTED_DAYS_EMPLOYED,CONVERTED_DAYS_LAST_PHONE_CHANGE,CONVERTED_DAYS_REGISTRATION,CONVERTED_DAYS_ID_PUBLISH,OWN_CAR_AGE
0,39.600000,27.600000,39.533333,9.700000,-1.000000
1,101.300000,20.566667,327.766667,81.233333,-1.000000
2,101.266667,36.866667,143.700000,115.266667,-1.000000
3,52.933333,84.533333,165.666667,15.900000,-1.000000
4,14.966667,35.666667,153.233333,79.300000,-1.000000
...,...,...,...,...,...
61497,124.406834,53.766667,125.933333,182.833333,-1.000000
61498,69.900000,54.600000,177.833333,71.766667,-1.241391
61499,6.233333,62.066667,39.866667,150.600000,-1.000000
61500,123.549325,52.900000,79.200000,142.000000,-1.000000


## Data Transformation

In [None]:
appl.shape

(307511, 89)

In [None]:
#Train_test Split somehow
# partitioning
train = appl[appl["SK_ID_CURR"].isin(y["SK_ID_CURR"]) == True]
test  = appl[appl["SK_ID_CURR"].isin(y["SK_ID_CURR"]) == False]

In [None]:
train.shape

(246009, 89)

In [None]:
test.shape

(61502, 89)

In [None]:
# NAME_TYPE_SUITE Gộp other_A với other_B vào One hot 
# NAME_INCOME_TYPE #Có nên gộp ko ta,k thì để ntn cx đc tùy
#Có mấy cái binary chắc p mapping nma tau lười quá tính sau đi
train = encode_column(train , ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','EMERGENCYSTATE_MODE','DAY_APPR_PROCESS_START'], encoding_type='label')
train = encode_column(train, ['NAME_TYPE_SUITE','NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE','ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE'], encoding_type='onehot')
train = create_logs(train, ["AMT_CREDIT", "AMT_INCOME_TOTAL", "AMT_GOODS_PRICE", "AMT_ANNUITY"], replace = True)


In [None]:
train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,YEARS_BUILD_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_CURR,CONVERTED_DAYS_BIRTH,CONVERTED_DAYS_EMPLOYED,CONVERTED_DAYS_REGISTRATION,CONVERTED_DAYS_ID_PUBLISH,CONVERTED_DAYS_LAST_PHONE_CHANGE,NUM_DOCUMENTS,DAY_APPR_PROCESS_START,OWN_CAR_AGE,CREDIT_BY_INCOME,ANNUITY_BY_INCOME,GOODS_PRICE_BY_INCOME,INCOME_PER_PERSON,PERCENT_WORKED,CNT_ADULTS,CHILDREN_RATIO,ANNUITY LENGTH,EXT_SOURCE_MEAN,NUM_EXT_SOURCES,OWN_CAR_AGE_RATIO,DAYS_ID_PUBLISHED_RATIO,DAYS_REGISTRATION_RATIO,DAYS_LAST_PHONE_CHANGE_RATIO,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,0,0,0,0,0,12.506181,14.072865,10.482892,13.937287,0.003541,1,1,0,1,1,0,2.0,1,1,MONDAY,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.804,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278621,558.833333,39.6,39.533333,9.7,27.6,1,1,-1.0,4.79075,0.132217,4.183333,135000.0,0.070862,2.0,0.0,36.234085,0.466757,2,-0.001789,0.017358,0.070743,0.049389,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,False,False,False,False
1,0,0,0,1,0,11.813037,12.652947,10.298481,12.601491,0.008019,1,1,0,1,0,0,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,,0.650442,,,,,,,,,,,,,,,,,2,2.0,0.0,2.0,0.0,,,,,,,139008,633.5,101.3,327.766667,81.233333,20.566667,1,1,-1.0,2.316167,0.2199,2.2,67500.0,0.159905,2.0,0.0,10.532818,0.650442,1,-0.001579,0.128229,0.51739,0.032465,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,0,1,0,1,0,11.707678,13.148033,9.992711,13.148033,0.028663,1,1,0,1,0,0,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,,0.322738,,,,,,,,,,,,,,,,,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138348,664.4,101.266667,143.7,115.266667,36.866667,1,1,-1.0,4.222222,0.179963,4.222222,121500.0,0.152418,1.0,0.0,23.461618,0.322738,1,-0.001505,0.17349,0.216285,0.055489,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,1,0,1,0,11.502885,13.103173,10.222614,13.026955,0.035792,1,1,1,1,1,0,2.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,,0.354225,0.621226,,,,,,,,,,,,,,,,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,64140,564.7,52.933333,165.666667,15.9,84.533333,1,1,-1.0,4.9545,0.277955,4.590909,49500.0,0.093737,2.0,0.0,17.824857,0.487726,2,-0.001771,0.028157,0.293371,0.149696,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,0,1,1,1,0,12.793862,14.240779,10.647233,14.240779,0.003122,1,1,1,1,0,0,2.0,3,3,MONDAY,16,0,0,0,0,1,1,,0.714279,0.540654,,,,,,,,,,,,,,,,2,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,219374,628.333333,14.966667,153.233333,79.3,35.666667,1,1,-1.0,4.25,0.116875,4.25,180000.0,0.02382,2.0,0.0,36.363636,0.627467,2,-0.001592,0.126207,0.243873,0.056764,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
test.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,YEARS_BUILD_MODE,FONDKAPREMONT_MODE,HOUSETYPE_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_CURR,CONVERTED_DAYS_BIRTH,CONVERTED_DAYS_EMPLOYED,CONVERTED_DAYS_REGISTRATION,CONVERTED_DAYS_ID_PUBLISH,CONVERTED_DAYS_LAST_PHONE_CHANGE,NUM_DOCUMENTS,DAY_APPR_PROCESS_START,OWN_CAR_AGE,CREDIT_BY_INCOME,ANNUITY_BY_INCOME,GOODS_PRICE_BY_INCOME,INCOME_PER_PERSON,PERCENT_WORKED,CNT_ADULTS,CHILDREN_RATIO,ANNUITY LENGTH,EXT_SOURCE_MEAN,NUM_EXT_SOURCES,OWN_CAR_AGE_RATIO,DAYS_ID_PUBLISHED_RATIO,DAYS_REGISTRATION_RATIO,DAYS_LAST_PHONE_CHANGE_RATIO
0,Cash loans,M,Y,N,2,207000.0,465457.5,52641.0,418500.0,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,0.00963,1,1,0,1,0,0,Sales staff,4.0,2,2,THURSDAY,11,0,0,0,0,1,1,Business Entity Type 3,0.675878,0.604894,0.000527,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,83659,443.233333,25.4,21.233333,143.566667,0.066667,1,Working day,-1.0,2.248587,0.254304,2.021739,51750.0,0.057306,2.0,0.5,8.84211,0.4271,3,-0.002256,0.323908,0.047906,0.00015
1,Cash loans,F,Y,Y,0,247500.0,1281712.5,48946.5,1179000.0,Unaccompanied,Commercial associate,Higher education,Single / not married,House / apartment,0.006852,1,1,0,1,0,1,Managers,1.0,3,3,THURSDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.430827,0.425351,0.712155,0.0753,0.0568,0.997,0.9592,0.1326,0.08,0.0517,0.4167,0.2917,0.0735,0.0601,0.0844,0.0058,0.1118,0.9216,reg oper account,block of flats,Monolithic,No,2.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,174814,492.6,38.033333,53.666667,151.533333,35.7,1,Working day,-1.0,5.178636,0.197764,4.763636,247500.0,0.077209,1.0,0.0,26.185989,0.522778,3,-0.00203,0.307619,0.108946,0.072473
2,Cash loans,F,Y,N,0,202500.0,495000.0,39109.5,495000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,1,1,1,1,0,0,Sales staff,2.0,2,2,TUESDAY,16,0,0,0,0,0,0,Self-employed,0.527239,0.53176,0.207964,,,,,,,,,,,,,,,,,,,,5.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,179486,596.9,21.3,83.566667,48.7,47.833333,1,Working day,-1.0,2.444444,0.193133,2.444444,101250.0,0.035684,2.0,0.0,12.656771,0.422321,3,-0.001675,0.081588,0.140001,0.080136
3,Cash loans,F,N,Y,0,247500.0,254700.0,24939.0,225000.0,Unaccompanied,State servant,Secondary / secondary special,Widow,House / apartment,0.04622,1,1,0,1,0,0,High skill tech staff,1.0,1,1,FRIDAY,14,0,0,0,0,0,0,Business Entity Type 3,,0.693521,0.614414,0.132,0.0645,0.9846,,,0.16,0.069,0.625,,,,0.1628,,0.0022,,,,Panel,No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57038,654.2,232.733333,372.233333,105.266667,66.666667,1,Working day,-1.0,1.029091,0.100764,0.909091,247500.0,0.355753,1.0,0.0,10.21292,0.653968,2,-0.001529,0.160909,0.56899,0.101906
4,Cash loans,M,N,Y,0,112500.0,308133.0,15862.5,234000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.01885,1,1,0,1,0,0,Laborers,1.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.654882,0.56069,0.636376,0.0619,0.0553,0.9717,,,0.0,0.1724,0.1667,,0.0866,,0.0749,,0.0149,,,block of flats,"Stone, brick",No,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,25672,677.566667,36.833333,243.3,16.466667,5.766667,1,Working day,-1.0,2.73896,0.141,2.08,112500.0,0.054361,1.0,0.0,19.425248,0.617316,3,-0.001476,0.024303,0.359079,0.008511


In [None]:
test = create_logs(test, ["AMT_CREDIT", "AMT_INCOME_TOTAL", "AMT_GOODS_PRICE", "AMT_ANNUITY"], replace = True)

# Có thể normalize

In [None]:
# SỬA LẠI 'XNA' SAU

mapping = {
    'Cash loans': 0,
    'Revolving loans': 1,
    'Customer loans': 2,
    'XNA': 3,
    'F': 0,
    'M': 1,
    'Y': 1,
    'N': 0,
    'No': 0,
    'Yes': 1,
    'Working day': 1,
    'Weekend': 0
}

columns_to_map = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE', 'DAY_APPR_PROCESS_START']

for col in columns_to_map:
    test.loc[:, col] = test[col].map(mapping).fillna(test[col])


  test.loc[:, col] = test[col].map(mapping).fillna(test[col])


In [None]:
test = encode_column(test, ['NAME_TYPE_SUITE','NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE','ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE'], encoding_type='onehot')

In [None]:
# Đảm bảo test và train có cùng các cột
train_columns = train.columns
test_columns = test.columns

# Thêm các cột thiếu vào test, điền giá trị 0
for col in train_columns:
    if col not in test_columns:
        test[col] = 0

# Đảm bảo thứ tự các cột trong test giống train
test = test[train_columns]

In [None]:
# # Rename features (giữ lại cột 'SK_ID_CURR' không thay đổi)
# train.columns = ["app_" + str(col) if col != "SK_ID_CURR" else str(col) for col in train.columns]
# test.columns = ["app_" + str(col) if col != "SK_ID_CURR" else str(col) for col in test.columns]

In [None]:
train.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,YEARS_BUILD_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_CURR,CONVERTED_DAYS_BIRTH,CONVERTED_DAYS_EMPLOYED,CONVERTED_DAYS_REGISTRATION,CONVERTED_DAYS_ID_PUBLISH,CONVERTED_DAYS_LAST_PHONE_CHANGE,NUM_DOCUMENTS,DAY_APPR_PROCESS_START,OWN_CAR_AGE,CREDIT_BY_INCOME,ANNUITY_BY_INCOME,GOODS_PRICE_BY_INCOME,INCOME_PER_PERSON,PERCENT_WORKED,CNT_ADULTS,CHILDREN_RATIO,ANNUITY LENGTH,EXT_SOURCE_MEAN,NUM_EXT_SOURCES,OWN_CAR_AGE_RATIO,DAYS_ID_PUBLISHED_RATIO,DAYS_REGISTRATION_RATIO,DAYS_LAST_PHONE_CHANGE_RATIO,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,0,0,0,0,0,12.506181,14.072865,10.482892,13.937287,0.003541,1,1,0,1,1,0,2.0,1,1,MONDAY,11,0,0,0,0,0,0,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.804,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278621,558.833333,39.6,39.533333,9.7,27.6,1,1,-1.0,4.79075,0.132217,4.183333,135000.0,0.070862,2.0,0.0,36.234085,0.466757,2,-0.001789,0.017358,0.070743,0.049389,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,False,False,False,False
1,0,0,0,1,0,11.813037,12.652947,10.298481,12.601491,0.008019,1,1,0,1,0,0,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,,0.650442,,,,,,,,,,,,,,,,,2,2.0,0.0,2.0,0.0,,,,,,,139008,633.5,101.3,327.766667,81.233333,20.566667,1,1,-1.0,2.316167,0.2199,2.2,67500.0,0.159905,2.0,0.0,10.532818,0.650442,1,-0.001579,0.128229,0.51739,0.032465,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,0,1,0,1,0,11.707678,13.148033,9.992711,13.148033,0.028663,1,1,0,1,0,0,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,,0.322738,,,,,,,,,,,,,,,,,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138348,664.4,101.266667,143.7,115.266667,36.866667,1,1,-1.0,4.222222,0.179963,4.222222,121500.0,0.152418,1.0,0.0,23.461618,0.322738,1,-0.001505,0.17349,0.216285,0.055489,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,1,0,1,0,11.502885,13.103173,10.222614,13.026955,0.035792,1,1,1,1,1,0,2.0,2,2,WEDNESDAY,16,0,0,0,0,0,0,,0.354225,0.621226,,,,,,,,,,,,,,,,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,64140,564.7,52.933333,165.666667,15.9,84.533333,1,1,-1.0,4.9545,0.277955,4.590909,49500.0,0.093737,2.0,0.0,17.824857,0.487726,2,-0.001771,0.028157,0.293371,0.149696,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,0,1,1,1,0,12.793862,14.240779,10.647233,14.240779,0.003122,1,1,1,1,0,0,2.0,3,3,MONDAY,16,0,0,0,0,1,1,,0.714279,0.540654,,,,,,,,,,,,,,,,2,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,219374,628.333333,14.966667,153.233333,79.3,35.666667,1,1,-1.0,4.25,0.116875,4.25,180000.0,0.02382,2.0,0.0,36.363636,0.627467,2,-0.001592,0.126207,0.243873,0.056764,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [None]:
test.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,YEARS_BUILD_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_CURR,CONVERTED_DAYS_BIRTH,CONVERTED_DAYS_EMPLOYED,CONVERTED_DAYS_REGISTRATION,CONVERTED_DAYS_ID_PUBLISH,CONVERTED_DAYS_LAST_PHONE_CHANGE,NUM_DOCUMENTS,DAY_APPR_PROCESS_START,OWN_CAR_AGE,CREDIT_BY_INCOME,ANNUITY_BY_INCOME,GOODS_PRICE_BY_INCOME,INCOME_PER_PERSON,PERCENT_WORKED,CNT_ADULTS,CHILDREN_RATIO,ANNUITY LENGTH,EXT_SOURCE_MEAN,NUM_EXT_SOURCES,OWN_CAR_AGE_RATIO,DAYS_ID_PUBLISHED_RATIO,DAYS_REGISTRATION_RATIO,DAYS_LAST_PHONE_CHANGE_RATIO,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden
0,0,1,1,0,2,12.240479,13.050778,10.87127,12.944435,0.00963,1,1,0,1,0,0,4.0,2,2,THURSDAY,11,0,0,0,0,1,1,0.675878,0.604894,0.000527,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,83659,443.233333,25.4,21.233333,143.566667,0.066667,1,1,-1.0,2.248587,0.254304,2.021739,51750.0,0.057306,2.0,0.5,8.84211,0.4271,3,-0.002256,0.323908,0.047906,0.00015,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,0,0,1,1,0,12.41917,14.063708,10.798504,13.980178,0.006852,1,1,0,1,0,1,1.0,3,3,THURSDAY,10,0,0,0,0,0,0,0.430827,0.425351,0.712155,0.0753,0.0568,0.997,0.9592,0.1326,0.08,0.0517,0.4167,0.2917,0.0735,0.0601,0.0844,0.0058,0.1118,0.9216,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,174814,492.6,38.033333,53.666667,151.533333,35.7,1,1,-1.0,5.178636,0.197764,4.763636,247500.0,0.077209,1.0,0.0,26.185989,0.522778,3,-0.00203,0.307619,0.108946,0.072473,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False
2,0,0,1,0,0,12.2185,13.112315,10.574146,13.112315,0.035792,1,1,1,1,0,0,2.0,2,2,TUESDAY,16,0,0,0,0,0,0,0.527239,0.53176,0.207964,,,,,,,,,,,,,,,,,5.0,0.0,5.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,179486,596.9,21.3,83.566667,48.7,47.833333,1,1,-1.0,2.444444,0.193133,2.444444,101250.0,0.035684,2.0,0.0,12.656771,0.422321,3,-0.001675,0.081588,0.140001,0.080136,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,0,0,1,0,12.41917,12.447846,10.124228,12.32386,0.04622,1,1,0,1,0,0,1.0,1,1,FRIDAY,14,0,0,0,0,0,0,,0.693521,0.614414,0.132,0.0645,0.9846,,,0.16,0.069,0.625,,,,0.1628,,0.0022,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,57038,654.2,232.733333,372.233333,105.266667,66.666667,1,1,-1.0,1.029091,0.100764,0.909091,247500.0,0.355753,1.0,0.0,10.21292,0.653968,2,-0.001529,0.160909,0.56899,0.101906,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,0,1,0,1,0,11.630717,12.63829,9.671776,12.363081,0.01885,1,1,0,1,0,0,1.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,0.654882,0.56069,0.636376,0.0619,0.0553,0.9717,,,0.0,0.1724,0.1667,,0.0866,,0.0749,,0.0149,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,25672,677.566667,36.833333,243.3,16.466667,5.766667,1,1,-1.0,2.73896,0.141,2.08,112500.0,0.054361,1.0,0.0,19.425248,0.617316,3,-0.001476,0.024303,0.359079,0.008511,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False


In [None]:
# rồi merge bảng thì cần bọn kia xong fillnull value sauk......
#fill missing value(dùng lib fill)
# Arrange lại cho đúng step by step để tối nói(sợ k kịp bê wa)
# EDA hướng đi cho HV