In [11]:
import pandas as pd
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Root folder of the dataset, relative to the notebook's working directory.
# Every CSV path in the loading cells is built as ROOT_DIR + "csv_files/...".
ROOT_DIR = './home-credit-credit-risk-model-stability/'

# Utils to format the tables

In [6]:
def set_table_dtypes(df):
    """Cast every column whose name ends in "P" or "A" to ``pl.Float64``.

    The last letter of a column name is used as a type marker. Only the
    "P"/"A" -> Float64 rule is implemented; it is meant as an example, so add
    further suffix rules here as needed.

    Args:
        df: A polars frame exposing ``columns`` and ``with_columns``.

    Returns:
        ``df`` itself when no column name matches; otherwise a new frame in
        which the matching columns are cast to Float64 and all other columns
        are left untouched.
    """
    # str.endswith (rather than col[-1]) so an empty column name is simply
    # skipped instead of raising IndexError.
    float_cols = [col for col in df.columns if col.endswith(("P", "A"))]
    if not float_cols:
        return df

    # A single with_columns call covers all casts; the cast keeps each
    # column's name, so no alias is needed.
    return df.with_columns(pl.col(float_cols).cast(pl.Float64))

def convert_strings(df):
    """Convert the text columns of a pandas DataFrame to ordered categoricals.

    Every column with object dtype or a pandas string dtype is replaced by an
    ordered categorical. Its categories are the values observed in the column
    plus an extra "Unknown" category, appended at the end when the column does
    not already contain it.

    Args:
        df: A pandas DataFrame. It is modified in place.

    Returns:
        The same ``df`` object, for chaining.
    """
    for col in df.columns:
        dtype = df[col].dtype
        # isinstance(..., pd.StringDtype) covers every pandas string dtype,
        # including the default one in newer pandas whose name is "str"
        # rather than "string".
        is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
        if not is_text:
            continue

        as_category = df[col].astype("string").astype("category")
        categories = as_category.cat.categories.to_list()
        # CategoricalDtype rejects duplicate categories, so only append
        # "Unknown" when it is not already one of the observed values.
        if "Unknown" not in categories:
            categories.append("Unknown")

        df[col] = as_category.astype(
            pd.CategoricalDtype(categories=categories, ordered=True)
        )
    return df

# Train data: load and concatenate the tables by depth

In [8]:
# Depth-0 base table of the train split; read as-is, without set_table_dtypes.
train_base_table_depth0 = pl.read_csv(
    ROOT_DIR + "csv_files/train/train_base.csv"
)

In [10]:
# applprev depth-1 table (train split): two CSV shards, read in order and
# stacked row-wise after the dtype casts.
train_appl_prev_depth1 = pl.concat(
    [
        set_table_dtypes(pl.read_csv(ROOT_DIR + shard_path))
        for shard_path in (
            "csv_files/train/train_applprev_1_0.csv",
            "csv_files/train/train_applprev_1_1.csv",
        )
    ],
    how="vertical_relaxed",
)

# applprev depth-2 table is a single file. The statement must not end with a
# comma: `x = expr,` binds a 1-tuple containing the frame, not the frame.
train_appl_prev_depth2 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_applprev_2.csv")
)

6525979
6525979


In [None]:
# credit_bureau_a depth-1 table (train split): shards _0 through _3, read in
# index order, dtype-cast, and stacked row-wise.
train_credit_bureau_depth1 = pl.concat(
    [
        pl.read_csv(
            ROOT_DIR + f"csv_files/train/train_credit_bureau_a_1_{shard}.csv"
        ).pipe(set_table_dtypes)
        for shard in range(4)  # files ..._1_0.csv to ..._1_3.csv
    ],
    how="vertical_relaxed",
)

In [None]:
# credit_bureau_a depth-2 table (train split): shards _0 through _10, read in
# index order, dtype-cast, and stacked row-wise.
train_credit_bureau_depth2 = pl.concat(
    [
        pl.read_csv(
            ROOT_DIR + f"csv_files/train/train_credit_bureau_a_2_{shard}.csv"
        ).pipe(set_table_dtypes)
        for shard in range(11)  # files ..._2_0.csv to ..._2_10.csv
    ],
    how="vertical_relaxed",
)

In [None]:
# credit_bureau_b tables (train split): one CSV per depth, each passed through
# the dtype-casting helper right after reading.
train_credit_bureau_b_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_credit_bureau_b_1.csv")
)
train_credit_bureau_b_depth2 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_credit_bureau_b_2.csv")
)

In [None]:
# Single-file depth-1 tables of the train split: debitcard, deposit, other.
train_debit_card_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_debitcard_1.csv")
)
train_deposit_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_deposit_1.csv")
)
train_other_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_other_1.csv")
)

# person tables of the train split: one CSV per depth.
train_person_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_person_1.csv")
)
train_person_depth2 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_person_2.csv")
)

In [None]:
# static depth-0 table (train split): two CSV shards, read in order,
# dtype-cast, and stacked row-wise.
train_static_depth0 = pl.concat(
    [
        set_table_dtypes(pl.read_csv(ROOT_DIR + shard_path))
        for shard_path in (
            "csv_files/train/train_static_0_0.csv",
            "csv_files/train/train_static_0_1.csv",
        )
    ],
    how="vertical_relaxed",
)
# static_cb depth-0 table comes from a single file.
train_static_cb_depth0 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/train/train_static_cb_0.csv")
)

In [None]:
# tax_registry depth-1 tables (train split), one CSV each for a, b and c.
# None of these statements may end with a comma: `x = expr,` binds a 1-tuple
# wrapping the frame instead of the frame itself.
train_tax_registry_a_depth1 = pl.read_csv(
    ROOT_DIR + "csv_files/train/train_tax_registry_a_1.csv"
).pipe(set_table_dtypes)
train_tax_registry_b_depth1 = pl.read_csv(
    ROOT_DIR + "csv_files/train/train_tax_registry_b_1.csv"
).pipe(set_table_dtypes)
train_tax_registry_c_depth1 = pl.read_csv(
    ROOT_DIR + "csv_files/train/train_tax_registry_c_1.csv"
).pipe(set_table_dtypes)

In [None]:
# Inventory of the train-split variables assigned by the loading cells.
# Each line is a bare name expression: it only looks the variable up, so it
# raises NameError if the corresponding loading cell was never run.
# In a notebook cell only the value of the final expression is rendered, so
# only the last line of this cell produces output.
# NOTE(review): if the intent was to inspect every table, each one needs an
# explicit display(...) call - confirm the intent.

# base and applprev tables
train_base_table_depth0
train_appl_prev_depth1
train_appl_prev_depth2

# credit bureau tables (a and b)
train_credit_bureau_depth1
train_credit_bureau_depth2
train_credit_bureau_b_depth1
train_credit_bureau_b_depth2

# debit card, deposit, other and person tables
train_debit_card_depth1
train_deposit_depth1
train_other_depth1
train_person_depth1
train_person_depth2

# static tables
train_static_depth0
train_static_cb_depth0

# tax registry tables
train_tax_registry_a_depth1
train_tax_registry_b_depth1
train_tax_registry_c_depth1

# Test data: load and concatenate the tables by depth

In [None]:
# Depth-0 base table of the test split; read as-is, without set_table_dtypes.
test_base_table_depth0 = pl.read_csv(
    ROOT_DIR + "csv_files/test/test_base.csv"
)

In [None]:
# applprev depth-1 table (test split): three CSV shards, read in order and
# stacked row-wise after the dtype casts.
test_appl_prev_depth1 = pl.concat(
    [
        set_table_dtypes(pl.read_csv(ROOT_DIR + shard_path))
        for shard_path in (
            "csv_files/test/test_applprev_1_0.csv",
            "csv_files/test/test_applprev_1_1.csv",
            "csv_files/test/test_applprev_1_2.csv",
        )
    ],
    how="vertical_relaxed",
)

# applprev depth-2 table is a single file. The statement must not end with a
# comma: `x = expr,` binds a 1-tuple containing the frame, not the frame.
test_appl_prev_depth2 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_applprev_2.csv")
)

In [None]:
# credit_bureau_a depth-1 table (test split): shards _0 through _4, read in
# index order, dtype-cast, and stacked row-wise.
test_credit_bureau_a_depth1 = pl.concat(
    [
        pl.read_csv(
            ROOT_DIR + f"csv_files/test/test_credit_bureau_a_1_{shard}.csv"
        ).pipe(set_table_dtypes)
        for shard in range(5)  # files ..._1_0.csv to ..._1_4.csv
    ],
    how="vertical_relaxed",
)

In [None]:
# credit_bureau_a depth-2 table (test split): shards _0 through _11, read in
# index order, dtype-cast, and stacked row-wise.
test_credit_bureau_a_depth2 = pl.concat(
    [
        pl.read_csv(
            ROOT_DIR + f"csv_files/test/test_credit_bureau_a_2_{shard}.csv"
        ).pipe(set_table_dtypes)
        for shard in range(12)  # files ..._2_0.csv to ..._2_11.csv
    ],
    how="vertical_relaxed",
)

In [None]:
# credit_bureau_b tables (test split): one CSV per depth, each passed through
# the dtype-casting helper right after reading.
test_credit_bureau_b_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_credit_bureau_b_1.csv")
)
test_credit_bureau_b_depth2 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_credit_bureau_b_2.csv")
)

In [None]:
# Single-file depth-1 tables of the test split: debitcard, deposit, other.
test_debit_card_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_debitcard_1.csv")
)
test_deposit_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_deposit_1.csv")
)
test_other_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_other_1.csv")
)

# person tables of the test split: one CSV per depth.
test_person_depth1 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_person_1.csv")
)
test_person_depth2 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_person_2.csv")
)

In [None]:
# static depth-0 table (test split): three CSV shards, read in order,
# dtype-cast, and stacked row-wise.
test_static_depth0 = pl.concat(
    [
        set_table_dtypes(pl.read_csv(ROOT_DIR + shard_path))
        for shard_path in (
            "csv_files/test/test_static_0_0.csv",
            "csv_files/test/test_static_0_1.csv",
            "csv_files/test/test_static_0_2.csv",
        )
    ],
    how="vertical_relaxed",
)

# static_cb depth-0 table comes from a single file.
test_static_cb_depth0 = set_table_dtypes(
    pl.read_csv(ROOT_DIR + "csv_files/test/test_static_cb_0.csv")
)

In [None]:
# tax_registry depth-1 tables (test split), one CSV each for a, b and c.
# None of these statements may end with a comma: `x = expr,` binds a 1-tuple
# wrapping the frame instead of the frame itself.
test_tax_registry_a_depth1 = pl.read_csv(
    ROOT_DIR + "csv_files/test/test_tax_registry_a_1.csv"
).pipe(set_table_dtypes)
test_tax_registry_b_depth1 = pl.read_csv(
    ROOT_DIR + "csv_files/test/test_tax_registry_b_1.csv"
).pipe(set_table_dtypes)
test_tax_registry_c_depth1 = pl.read_csv(
    ROOT_DIR + "csv_files/test/test_tax_registry_c_1.csv"
).pipe(set_table_dtypes)

In [None]:
# Inventory of the test-split variables assigned by the loading cells.
# Each line is a bare name expression: it only looks the variable up, so it
# raises NameError if the corresponding loading cell was never run.
# In a notebook cell only the value of the final expression is rendered, so
# only the last line of this cell produces output.
# NOTE(review): if the intent was to inspect every table, each one needs an
# explicit display(...) call - confirm the intent.

# base and applprev tables
test_base_table_depth0
test_appl_prev_depth1
test_appl_prev_depth2

# credit bureau tables (a and b)
test_credit_bureau_a_depth1
test_credit_bureau_a_depth2
test_credit_bureau_b_depth1
test_credit_bureau_b_depth2

# debit card, deposit, other and person tables
test_debit_card_depth1
test_deposit_depth1
test_other_depth1
test_person_depth1
test_person_depth2

# static tables
test_static_depth0
test_static_cb_depth0

# tax registry tables
test_tax_registry_a_depth1
test_tax_registry_b_depth1
test_tax_registry_c_depth1