> The goal is to create a model that is stable and performs well in the future.

#### Load the data

In [1]:
from eda_function import *

import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Input locations. Every path ends with "/" so a file name can be appended
# directly to it.
dataPath = "../data/"
parquetPath = f"{dataPath}parquet_files/"
parquetTrain = f"{parquetPath}train/"
parquetTest = f"{parquetPath}test/"

In [2]:
# The following function is just an example

def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in "P" or "A" to Float64.

    Columns with any other name suffix keep their dtype. When no column
    matches, the input frame itself is returned.
    """
    float_casts = [
        pl.col(name).cast(pl.Float64).alias(name)
        for name in df.columns
        if name[-1] in ("P", "A")
    ]
    if not float_casts:
        return df
    # All casts are applied in a single pass over the frame.
    return df.with_columns(float_casts)

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn text columns into ordered categoricals that include an "Unknown" level.

    Every column whose dtype name is ``object`` or ``string`` is cast to the
    pandas ``string`` dtype and then to an ordered categorical. Its categories
    are the observed values followed by ``"Unknown"``; if the data already
    contains a literal ``"Unknown"`` value, that existing category is reused
    instead of being appended a second time. Other columns are left untouched.

    The frame is modified **in place** and also returned, so callers may use
    the return value or rely on the mutation.
    """
    for col in df.columns:
        if df[col].dtype.name not in ('object', 'string'):
            continue

        as_category = df[col].astype("string").astype('category')
        categories = as_category.cat.categories.to_list()
        # CategoricalDtype rejects duplicate categories, so only add the
        # placeholder when it is not already one of the observed values.
        if "Unknown" not in categories:
            categories.append("Unknown")

        new_dtype = pd.CategoricalDtype(categories=categories, ordered=True)
        df[col] = as_category.astype(new_dtype)
    return df

In [3]:
# Base table, plus the static table which is stored as two parquet shards.
train_base = pl.read_parquet(f"{parquetTrain}train_base.parquet")
train_static = pl.concat(
    [
        pl.read_parquet(f"{parquetTrain}{shard}").pipe(set_table_dtypes)
        for shard in ("train_static_0_0.parquet", "train_static_0_1.parquet")
    ],
    # Stack the shards row-wise, letting polars reconcile differing column dtypes.
    how="vertical_relaxed",
)

# train_static_cb = pl.read_parquet(parquetTrain + 'train_static_cb_0.parquet').pipe(set_table_dtypes)
# train_person_1 = pl.read_parquet(parquetTrain + 'train_person_1.parquet').pipe(set_table_dtypes)
# train_credit_bureau_b_2 = pl.read_parquet(parquetTrain + 'train_credit_bureau_b_2.parquet').pipe(set_table_dtypes)

In [4]:
# Same loading logic as for the train tables, using the shared `parquetTest`
# prefix instead of rebuilding the directory path by hand.
test_basetable = pl.read_parquet(parquetTest + "test_base.parquet")
test_static = pl.concat(
    [
        pl.read_parquet(parquetTest + shard).pipe(set_table_dtypes)
        for shard in (
            "test_static_0_0.parquet",
            "test_static_0_1.parquet",
            "test_static_0_2.parquet",
        )
    ],
    # Stack the shards row-wise, letting polars reconcile differing column dtypes.
    how="vertical_relaxed",
)
# test_static_cb = pl.read_parquet(dataPath + "parquet_files/test/test_static_cb_0.parquet").pipe(set_table_dtypes)
# test_person_1 = pl.read_parquet(dataPath + "parquet_files/test/test_person_1.parquet").pipe(set_table_dtypes) 
# test_credit_bureau_b_2 = pl.read_parquet(dataPath + "parquet_files/test/test_credit_bureau_b_2.parquet").pipe(set_table_dtypes) 

#### Feature engineering

In [5]:
# In this example we will process only A-type and M-type columns, so we need to select them.
# selected_static_cols = []
# for col in train_static.columns:
#     if col[-1] in ("A", "M"):
#         selected_static_cols.append(col)
# print(selected_static_cols)

# selected_static_cb_cols = []
# for col in train_static_cb.columns:
#     if col[-1] in ("A", "M"):
#         selected_static_cb_cols.append(col)
# print(selected_static_cb_cols)

In [6]:
# Attach the static features to the base table. A left join keeps every base
# row, including those with no match in the static table.
data = train_base.join(train_static, on="case_id", how="left")
# .join(
#     train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
# ).join(
#     train_person_1_feats_1, how="left", on="case_id"
# ).join(
#     train_person_1_feats_2, how="left", on="case_id"
# ).join(
#     train_credit_bureau_b_2_feats, how="left", on="case_id"
# )

In [7]:
## Same on test
# test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
#     pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
#     (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
# )

# test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
#     pl.col("num_group1") == 0
# ).drop("num_group1").rename({"housetype_905L": "person_housetype"})

# test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
#     pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
#     (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
# )

# Left join so that every row of the test base table is kept.
data_submission = test_basetable.join(test_static, on="case_id", how="left")
# .join(
#     test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
# ).join(
#     test_person_1_feats_1, how="left", on="case_id"
# ).join(
#     test_person_1_feats_2, how="left", on="case_id"
# ).join(
#     test_credit_bureau_b_2_feats, how="left", on="case_id"
# )

In [8]:
# Split the distinct case_ids 60/20/20 into train/valid/test: first carve off
# 60% for training, then halve the remaining hold-out.
case_ids = data['case_id'].unique().shuffle(seed=1)
case_ids_train, case_ids_holdout = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_holdout, train_size=0.5, random_state=1)

# Predictors are the columns whose name is lowercase except for a single
# trailing uppercase letter; this leaves out case_id, WEEK_NUM and target.
cols_pred = [
    name
    for name in data.columns
    if name[-1].isupper() and name[:-1].islower()
]

print(cols_pred)

def from_polars_to_pandas(case_ids) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series]":
    """Extract the rows of the global ``data`` frame for the given case ids as pandas objects.

    Parameters
    ----------
    case_ids
        Collection of ``case_id`` values to keep (anything accepted by
        polars' ``Expr.is_in``).

    Returns
    -------
    tuple
        ``(base, X, y)`` where ``base`` is a DataFrame with the columns
        ``case_id``, ``WEEK_NUM`` and ``target``, ``X`` is a DataFrame with
        the predictor columns listed in the global ``cols_pred``, and ``y``
        is the ``target`` column as a Series.
    """
    # Filter once and reuse the result instead of re-scanning `data` for each output.
    subset = data.filter(pl.col("case_id").is_in(case_ids))
    return (
        subset[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        subset[cols_pred].to_pandas(),
        subset["target"].to_pandas(),
    )

['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L', 'currdebt_22A',

In [9]:
# Materialise each split as (base frame, feature frame, target series).
(
    (base_train, X_train, y_train),
    (base_valid, X_valid, y_valid),
    (base_test, X_test, y_test),
) = [from_polars_to_pandas(ids) for ids in (case_ids_train, case_ids_valid, case_ids_test)]

# convert_strings modifies the frame it receives in place, so the three
# feature frames are updated without needing to capture the return value.
for df in (X_train, X_valid, X_test):
    convert_strings(df)

In [10]:
# Report (rows, columns) of each feature frame.
for split_label, features in (("Train", X_train), ("Valid", X_valid), ("Test", X_test)):
    print(f"{split_label}: {features.shape}")

Train: (915995, 167)
Valid: (305332, 167)
Test: (305332, 167)


#### Training LightGBM

In [11]:
# LightGBM datasets; the validation set is built with the training set as its reference.
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    # Task definition
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    # Tree shape
    # NOTE(review): with max_depth=3 a tree presumably cannot grow to 31 leaves,
    # so num_leaves is likely not the binding limit here — confirm.
    "max_depth": 3,
    "num_leaves": 31,
    # Learning schedule and sub-sampling
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    # Upper bound on boosting rounds; early stopping usually ends training sooner.
    "n_estimators": 1000,
    "verbose": -1,
    "random_state": 42,
}

# Log the validation metric every 50 rounds and stop once it has not improved
# for 10 consecutive rounds.
training_callbacks = [lgb.log_evaluation(50), lgb.early_stopping(10)]

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    callbacks=training_callbacks,
)



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.752097
[100]	valid_0's auc: 0.766762
[150]	valid_0's auc: 0.77287
[200]	valid_0's auc: 0.7764
[250]	valid_0's auc: 0.779378
[300]	valid_0's auc: 0.781271
[350]	valid_0's auc: 0.783326
[400]	valid_0's auc: 0.784346
[450]	valid_0's auc: 0.785199
[500]	valid_0's auc: 0.786441
[550]	valid_0's auc: 0.7874
[600]	valid_0's auc: 0.78807
[650]	valid_0's auc: 0.788568
Early stopping, best iteration is:
[640]	valid_0's auc: 0.788607


In [12]:
# Score every split at the best early-stopping iteration and store the
# prediction next to the target in the corresponding base frame.
scored_splits = (
    ("train", base_train, X_train),
    ("valid", base_valid, X_valid),
    ("test", base_test, X_test),
)
for _, base, X in scored_splits:
    base["score"] = gbm.predict(X, num_iteration=gbm.best_iteration)

for split_name, base, _ in scored_splits:
    print(f'The AUC score on the {split_name} set is: {roc_auc_score(base["target"], base["score"])}')

The AUC score on the train set is: 0.7990342077825373
The AUC score on the valid set is: 0.7886069393220483
The AUC score on the test set is: 0.7848292277298203


In [13]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Stability-adjusted Gini computed over the weekly groups of ``base``.

    ``base`` must provide the columns ``WEEK_NUM``, ``target`` and ``score``.
    For each distinct ``WEEK_NUM`` (in ascending order) the Gini coefficient
    ``2 * AUC - 1`` is computed. A straight line is then fitted to the weekly
    Ginis against their position ``0, 1, 2, ...`` in that sequence.

    Returns
    -------
    float
        ``mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)``,
        i.e. the average Gini, penalised for a downward trend (a positive
        slope is not rewarded) and, with the default negative ``w_resstd``,
        for scatter around the fitted line.
    """
    weekly_frame = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    gini_per_week = [
        2 * roc_auc_score(week_rows["target"], week_rows["score"]) - 1
        for _, week_rows in weekly_frame.groupby("WEEK_NUM")
    ]

    # Regress the Gini on the week's ordinal position, not on the WEEK_NUM value itself.
    week_position = np.arange(len(gini_per_week))
    slope, intercept = np.polyfit(week_position, gini_per_week, 1)
    fitted = slope * week_position + intercept
    residual_std = np.std(gini_per_week - fitted)

    mean_gini = np.mean(gini_per_week)
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_std

# Stability metric for each split, computed from the scored base frames.
stability_score_train, stability_score_valid, stability_score_test = (
    gini_stability(scored_base) for scored_base in (base_train, base_valid, base_test)
)

for split_name, stability in (
    ("train", stability_score_train),
    ("valid", stability_score_valid),
    ("test", stability_score_test),
):
    print(f'The stability score on the {split_name} set is: {stability}')

The stability score on the train set is: 0.575792759128923
The stability score on the valid set is: 0.5478025269840563
The stability score on the test set is: 0.5372402252533385


#### Submission

In [14]:
X_submission = data_submission[cols_pred].to_pandas()
# NOTE(review): this column is forced to object so that convert_strings turns
# it into a categorical; presumably it does not arrive as a text column in the
# test files — confirm, and check whether other columns need the same treatment.
X_submission['validfrom_1069D'] = X_submission['validfrom_1069D'].astype('object')
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

# Align every categorical submission column with the dtype the model was
# trained on. The training dtype is reused as-is: its category order is
# deterministic (building one from a set is not), and X_train is not modified
# after the model has been fitted.
for col in categorical_cols:
    train_dtype = X_train[col].dtype
    unseen_categories = set(X_submission[col].cat.categories) - set(train_dtype.categories)
    # Values never seen during training are mapped to the "Unknown" level.
    X_submission.loc[X_submission[col].isin(unseen_categories), col] = "Unknown"
    X_submission[col] = X_submission[col].astype(train_dtype)

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [15]:
# One score per case_id, written with case_id as the index column of the CSV.
submission_index = pd.Index(data_submission["case_id"].to_numpy(), name="case_id")
submission = pd.DataFrame({"score": y_submission_pred}, index=submission_index)
submission.to_csv("./submission_base.csv")