# Home Credit 2024

# Import all required dependencies

In [1]:
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
#warnings.simplefilter(action='ignore', category=FutureWarning)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', 500)

import joblib

from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import optuna

import lightgbm as lgb
#import xgboost as xgb
#from sklearn.linear_model import LogisticRegression
#from sklearn.svm import SVC, LinearSVC
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.naive_bayes import GaussianNB
#from sklearn.linear_model import Perceptron
#from sklearn.linear_model import SGDClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier



# Load Data

Depth values:

* depth=0 - These are static features directly tied to a specific *case_id*.
* depth=1 - Each *case_id* has an associated historical record, indexed by *num_group1*.
* depth=2 - Each *case_id* has an associated historical record, indexed by both *num_group1* and *num_group2*.

## Configure input paths


In [2]:
class CFG:
    """Filesystem locations of the competition input data."""

    # Dataset root; the CSV files read later in the notebook are resolved against it.
    root_dir = Path("/kaggle/input/home-credit-credit-risk-model-stability/")
    # Directories holding the parquet tables of the train and test splits.
    train_dir = Path("/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train")
    test_dir = Path("/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test")

## Load feature definitions

In [3]:
# Load the feature dictionary shipped with the competition data and show it.
feature_definitions_df = pd.read_csv(CFG.root_dir / "feature_definitions.csv")
display(feature_definitions_df)
# Put the row-display limit back to its default for the following cells.
# `reset_option` is called with the option name only: the extra positional
# argument that used to be passed here is not part of its documented signature.
pd.reset_option("display.max_rows")

Unnamed: 0,Variable,Description
0,actualdpd_943P,Days Past Due (DPD) of previous contract (actu...
1,actualdpdtolerance_344P,DPD of client with tolerance.
2,addres_district_368M,District of the person's address.
3,addres_role_871L,Role of person's address.
4,addres_zip_823M,Zip code of the address.
...,...,...
460,totinstallast1m_4525188A,Total amount of monthly instalments paid in th...
461,twobodfilling_608L,Type of application process.
462,type_25L,Contact type of a person.
463,typesuite_864L,Persons accompanying the client during the loa...


## Data Collection and Preprocessing using Pipeline

In [4]:
class Pipeline:
    """Stateless polars helpers: dtype casting, date handling, column pruning."""

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from their name.

        The key columns ``case_id``, ``WEEK_NUM``, ``num_group1`` and
        ``num_group2`` become Int64 and ``date_decision`` becomes Date.
        Any other column is cast by its last character: ``P``/``A`` to
        Float64, ``M`` to String, ``D`` to Date. Columns matching none of
        these rules are returned unchanged.
        """
        key_columns = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
        dtype_by_suffix = {
            "P": pl.Float64,
            "A": pl.Float64,
            "M": pl.String,
            "D": pl.Date,
        }

        for name in df.columns:
            if name in key_columns:
                target = pl.Int64
            elif name == "date_decision":
                target = pl.Date
            else:
                target = dtype_by_suffix.get(name[-1])

            if target is not None:
                df = df.with_columns(pl.col(name).cast(target))

        return df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` column by its offset in days from ``date_decision``.

        ``date_decision`` and ``MONTH`` are dropped from the result.
        """
        date_columns = [name for name in df.columns if name[-1] == "D"]
        for name in date_columns:
            offset = pl.col(name) - pl.col("date_decision")
            df = df.with_columns(offset.dt.total_days())

        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. A column
        is removed when more than 95% of its values are null, or when it is
        a String column with exactly 1 or more than 200 distinct values.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        # Pass 1: columns that are almost entirely null.
        for name in df.columns:
            if name in protected:
                continue
            if df[name].is_null().mean() > 0.95:
                df = df.drop(name)

        # Pass 2: string columns that are constant or have too many levels.
        for name in df.columns:
            if name in protected or not (df[name].dtype == pl.String):
                continue
            n_levels = df[name].n_unique()
            if n_levels == 1 or n_levels > 200:
                df = df.drop(name)

        return df

## Aggregate data from different datasets

In [5]:
class Aggregator:
    """Builds the polars expressions used inside ``group_by("case_id").agg``.

    Columns are routed to an aggregator list by the last character of their
    name (or by containing ``num_group``); each produced expression is
    aliased ``<aggregator name>_<column>``.
    """

    num_aggregators = [pl.max, pl.min, pl.first, pl.last, pl.mean]
    str_aggregators = [pl.max, pl.min, pl.first, pl.last] # n_unique
    group_aggregators = [pl.max, pl.min, pl.first, pl.last]

    @staticmethod
    def _expand(methods, cols):
        """Return one aliased expression per (method, column), grouped by method."""
        return [
            method(col).alias(f"{method.__name__}_{col}")
            for method in methods
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """Expressions for columns whose name ends in ``P`` or ``A``."""
        selected = [col for col in df.columns if col[-1] in ("P", "A")]
        return Aggregator._expand(Aggregator.num_aggregators, selected)

    @staticmethod
    def date_expr(df):
        """Expressions for columns whose name ends in ``D`` (same aggregators as numeric)."""
        selected = [col for col in df.columns if col[-1] in ("D",)]
        return Aggregator._expand(Aggregator.num_aggregators, selected)

    @staticmethod
    def str_expr(df):
        """Expressions for columns whose name ends in ``M``, plus a per-column mode."""
        selected = [col for col in df.columns if col[-1] in ("M",)]
        base = Aggregator._expand(Aggregator.str_aggregators, selected)

        # Most frequent non-null value of each column, aliased ``mode_<column>``.
        modes = []
        for col in selected:
            most_common = pl.col(col).drop_nulls().mode().first()
            modes.append(most_common.alias(f"mode_{col}"))

        return base + modes

    @staticmethod
    def other_expr(df):
        """Expressions for columns whose name ends in ``T`` or ``L``."""
        selected = [col for col in df.columns if col[-1] in ("T", "L")]
        return Aggregator._expand(Aggregator.str_aggregators, selected)

    @staticmethod
    def count_expr(df):
        """Expressions for the ``num_group*`` index columns.

        A ``pl.count`` based feature on the first group column was tried
        here at some point and is currently disabled.
        """
        selected = [col for col in df.columns if "num_group" in col]
        return Aggregator._expand(Aggregator.group_aggregators, selected)

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression groups in a fixed order."""
        builders = (
            Aggregator.num_expr,
            Aggregator.date_expr,
            Aggregator.str_expr,
            Aggregator.other_expr,
            Aggregator.count_expr,
        )
        exprs = []
        for build in builders:
            exprs += build(df)

        return exprs

## Read files and apply the dtype casting and aggregation defined above

In [6]:
def read_file(path, depth=None):
    """Read one parquet file and normalise its dtypes.

    When ``depth`` is 1 or 2 the table is additionally collapsed to one row
    per ``case_id`` using the expressions from ``Aggregator.get_exprs``;
    for any other ``depth`` the typed table is returned as is.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))

    if depth not in (1, 2):
        return table

    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Read every parquet file matching a glob pattern into one table.

    Each matching file is read and passed through
    ``Pipeline.set_table_dtypes``; the pieces are concatenated vertically
    (``vertical_relaxed``). When ``depth`` is 1 or 2 the concatenated table
    is collapsed to one row per ``case_id`` with ``Aggregator.get_exprs``.

    Args:
        regex_path: glob pattern (string or path-like) selecting the files.
        depth: table depth; aggregation happens only for 1 or 2.

    Returns:
        The concatenated (and possibly aggregated) polars DataFrame.

    Raises:
        FileNotFoundError: if the pattern matches no file.
    """
    # glob() yields matches in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order of the concatenated table - and anything
    # order-sensitive computed from it, such as first/last aggregates -
    # independent of how the directory happens to be listed.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        # Fail with the offending pattern instead of an opaque concat error.
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    chunks = [pl.read_parquet(path).pipe(Pipeline.set_table_dtypes) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))

    return df

## Feature Engineering

In [7]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Join all feature tables onto the base table and convert dates.

    Adds ``month_decision`` and ``weekday_decision`` derived from
    ``date_decision``, left-joins every table of ``depth_0``, ``depth_1``
    and ``depth_2`` (in that order) on ``case_id``, then applies
    ``Pipeline.handle_dates``. Clashing column names from the i-th joined
    table receive the suffix ``_<i>``.
    """
    decision_date = pl.col("date_decision")
    df_base = df_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    feature_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(feature_tables):
        df_base = df_base.join(
            table, how="left", on="case_id", suffix=f"_{position}"
        )

    return Pipeline.handle_dates(df_base)

In [8]:
def to_pandas(df_data, cat_cols=None):
    """Convert ``df_data`` to pandas and make the given columns categorical.

    ``df_data`` only needs a ``to_pandas()`` method. When ``cat_cols`` is
    None, every column that comes out with ``object`` dtype is converted;
    otherwise exactly the listed columns are.
    """
    frame = df_data.to_pandas()

    if cat_cols is None:
        names = list(frame.select_dtypes("object").columns)
    else:
        names = cat_cols

    frame[names] = frame[names].astype("category")
    return frame

In [9]:
def reduce_mem_usage(df, float16_as32=True):
    """Downcast the columns of a pandas DataFrame to reduce memory usage.

    The frame is modified in place and also returned. Per column:

    * ``category`` columns are left untouched;
    * ``object`` columns are converted to ``category``;
    * NumPy signed / unsigned integer columns are cast to the narrowest
      integer type of the same signedness that holds their min and max;
    * NumPy float columns are cast to float32 when their range fits float16
      or float32 (to float16 only if ``float16_as32`` is False and the range
      fits float16), otherwise to float64;
    * everything else (bool, datetime, extension dtypes, ...) is left as is,
      because a numeric range check is meaningless or impossible for them.

    Args:
        df: DataFrame to shrink.
        float16_as32: store float16-range columns as float32 instead of
            float16.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float16_info = np.finfo(np.float16)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) == "category":
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns have a numeric range that
        # can be checked; bool, datetime and extension dtypes are skipped
        # rather than being silently turned into floats or raising.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in "iu":
            # An empty column has no min/max (NaN): nothing to decide on.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values are representable too.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Comparisons with NaN are False, so an all-NaN column falls
            # through to float64.
            if float16_info.min <= c_min and c_max <= float16_info.max:
                target = np.float32 if float16_as32 else np.float16
            elif float32_info.min <= c_min and c_max <= float32_info.max:
                target = np.float32
            else:
                target = np.float64
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

## Prepare Dataframe

In [10]:
def prepare_df(data_dir, cat_cols=None, mode="train", display_store=False, train_cols=None):
    """Load, join and filter all tables of one split and return a pandas frame.

    Args:
        data_dir: directory containing the ``{mode}_*.parquet`` files.
        cat_cols: columns to make categorical in the pandas result; when
            None they are detected from the ``object`` columns.
        mode: file-name prefix of the split, ``"train"`` or anything else.
            Only ``"train"`` runs ``Pipeline.filter_cols``; every other mode
            selects the columns listed in ``train_cols`` instead.
        display_store: if true, display the loaded tables before joining.
        train_cols: columns to keep in non-train mode (``"target"`` is
            always excluded). None or an empty collection keeps every
            column of the joined table.

    Returns:
        pandas DataFrame with one row per row of the base table.
    """
    print("Collecting data...")
    data_store = {
        "df_base": read_file(data_dir / f"{mode}_base.parquet"),
        "depth_0": [
            read_file(data_dir / f"{mode}_static_cb_0.parquet"),
            read_files(data_dir / f"{mode}_static_0_*.parquet"),
        ],
        "depth_1": [
            read_files(data_dir / f"{mode}_applprev_1_*.parquet", 1),
            read_file(data_dir / f"{mode}_tax_registry_a_1.parquet", 1),
            read_file(data_dir / f"{mode}_tax_registry_b_1.parquet", 1),
            read_file(data_dir / f"{mode}_tax_registry_c_1.parquet", 1),
            read_file(data_dir / f"{mode}_credit_bureau_b_1.parquet", 1),
            read_file(data_dir / f"{mode}_other_1.parquet", 1),
            read_file(data_dir / f"{mode}_person_1.parquet", 1),
            read_file(data_dir / f"{mode}_deposit_1.parquet", 1),
            read_file(data_dir / f"{mode}_debitcard_1.parquet", 1),
        ],
        "depth_2": [
            read_file(data_dir / f"{mode}_credit_bureau_b_2.parquet", 2),
        ]
    }
    if display_store:
        display(data_store)

    print("Feature engeneering...")
    feats_df = feature_eng(**data_store)
    print("  feats_df shape:\t", feats_df.shape)

    # The individual tables are no longer needed once joined; free them
    # before the pandas conversion.
    del data_store
    gc.collect()

    print("Filter cols...")
    if mode == "train":
        feats_df = feats_df.pipe(Pipeline.filter_cols)
    else:
        # A None sentinel replaces the former mutable `[]` default; an empty
        # collection passed explicitly keeps its old meaning ("keep all").
        if train_cols is None or len(train_cols) == 0:
            train_cols = feats_df.columns
        feats_df = feats_df.select([col for col in train_cols if col != "target"])
    print("  feats_df shape:\t", feats_df.shape)

    print("Convert to pandas...")
    feats_df = to_pandas(feats_df, cat_cols)
    return feats_df

### Configure train data

In [11]:
# Build the training feature table (default mode "train", which also applies
# the column filter).
train_df = prepare_df(CFG.train_dir)
# Remember which columns ended up categorical so the test split can be given
# the same categorical columns.
cat_cols = list(train_df.select_dtypes("category").columns)

Collecting data...
Feature engeneering...
  feats_df shape:	 (1526659, 927)
Filter cols...
  feats_df shape:	 (1526659, 516)
Convert to pandas...


In [12]:
#display(train_df)

In [13]:
#display(cat_cols)

### Configure test data

In [14]:
# Build the test feature table restricted to the training columns and with
# the training categorical column list.
test_df = prepare_df(CFG.test_dir, cat_cols=cat_cols, mode="test", train_cols=train_df.columns)

Collecting data...
Feature engeneering...
  feats_df shape:	 (10, 926)
Filter cols...
  feats_df shape:	 (10, 515)
Convert to pandas...


In [15]:
#display(test_df)

### Reduce memory usage and save

In [16]:
# Downcast numeric columns of both splits; each call prints the memory
# footprint before and after.
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Memory usage of dataframe is 4574.60 MB
Memory usage after optimization is: 2357.21 MB
Decreased by 48.5%
Memory usage of dataframe is 0.05 MB
Memory usage after optimization is: 0.04 MB
Decreased by 15.4%


In [17]:
#train_df.to_parquet("train_full.parquet")

# Exploration

In [18]:
# Sanity checks on the training split: repeated case_id values and the
# range of WEEK_NUM.
print("Train is duplicated:\t", train_df["case_id"].duplicated().any())
print("Train Week Range:\t", (train_df["WEEK_NUM"].min(), train_df["WEEK_NUM"].max()))

print()

# Same checks on the test split.
print("Test is duplicated:\t", test_df["case_id"].duplicated().any())
print("Test Week Range:\t", (test_df["WEEK_NUM"].min(), test_df["WEEK_NUM"].max()))


'\nprint("Train is duplicated:\t", train_df["case_id"].duplicated().any())\nprint("Train Week Range:\t", (train_df["WEEK_NUM"].min(), train_df["WEEK_NUM"].max()))\n\nprint()\n\nprint("Test is duplicated:\t", test_df["case_id"].duplicated().any())\nprint("Test Week Range:\t", (test_df["WEEK_NUM"].min(), test_df["WEEK_NUM"].max()))\n'

In [19]:
# Line plot of the target against WEEK_NUM on the training split.
sns.lineplot(
        data=train_df,
        x="WEEK_NUM",
        y="target",
)
plt.show()


'\nsns.lineplot(\n        data=train_df,\n        x="WEEK_NUM",\n        y="target",\n)\nplt.show()\n'

### Features - Missing values - Unique values

In [20]:
def dataframe_summary(dataframe):
    """Summarise every column of a pandas DataFrame.

    Returns a DataFrame with one column per input column and the rows
    ``missed`` (count of missing values), ``missed_percentage`` (that count
    as a percentage of the row count), ``unique`` (length of
    ``Series.unique()``) and ``type`` (the column dtype).
    """
    n_rows = dataframe.shape[0]
    per_column = {}

    for name in dataframe.columns:
        series = dataframe[name]
        n_missing = series.isna().sum()
        per_column[name] = {
            'missed': n_missing,
            'missed_percentage': n_missing * 100 / n_rows,
            'unique': len(series.unique()),
            'type': series.dtype,
        }

    return pd.DataFrame.from_dict(per_column)

# Transposed so that each feature is a row and each statistic a column.
train_df_summary = dataframe_summary(train_df).T
test_df_summary = dataframe_summary(test_df).T

### We now have a summary of each dataframe with:
* *missed*, the number of missing values in the feature
* *missed_percentage*, the percentage of missing values in the column
* *unique*, the number of unique values
* *type*, the data type of the feature

In [21]:
# Show the training summary with features as columns.
train_df_summary.T

In [22]:
# Categorical features with less than 5% missing values (features as columns).
train_df_summary[(train_df_summary.missed_percentage < 5) & (train_df_summary.type == 'category')].T

### Categorical features with a large number of unique values:
* lastapprcommoditycat_1041M - **45**
* lastcancelreason_561M - **74**
* lastrejectcommoditycat_161M - **45**
* lastrejectcommodtypec_5251769M - **187**

In [23]:
# Distinct values of lastapprcommoditycat_1041M in the training split.
train_df.lastapprcommoditycat_1041M.unique()

In [24]:
# Distinct values of lastcancelreason_561M in the training split.
train_df.lastcancelreason_561M.unique()

In [25]:
# Distinct values of lastrejectcommoditycat_161M in the training split.
train_df.lastrejectcommoditycat_161M.unique()

In [26]:
# Distinct values of lastrejectcommodtypec_5251769M in the training split.
train_df.lastrejectcommodtypec_5251769M.unique()

### Extra preprocessing:
* keep only features with less than 5% missing data
* replace missing values with the MEAN (numeric features) or the MODE (categorical features)
* one-hot encode categorical features with fewer than 45 categories

In [27]:
def fillna_n_encode(train_dataframe, test_dataframe, summary):
    """Select low-missing features, impute them and one-hot encode categoricals.

    Features are chosen from ``summary`` (one row per feature with the
    columns ``missed_percentage``, ``type`` and ``unique``): numeric
    features with less than 5% missing values, and categorical features
    with less than 5% missing values and fewer than 45 unique values.
    ``target`` is never treated as a feature.

    Missing numeric values are replaced by the column mean and missing
    categorical values by the column mode. Both statistics are computed on
    ``train_dataframe`` only, so nothing about the test rows leaks into the
    values used for training. The one-hot encoder is fitted on the combined
    rows, so every category present in either frame is known to it.

    Args:
        train_dataframe: training rows (must contain ``target``).
        test_dataframe: test rows.
        summary: per-feature summary of the training frame.

    Returns:
        ``(train_part, test_part)``: the encoded rows split back at the
        original train size. Both parts carry a ``target`` column.
    """
    # define numeric and categorical columns with less than 5% missed values
    print('Defining columns...')
    num_feat = summary[(summary.missed_percentage < 5) & (summary.type != 'category')].index.to_list()
    # Remove the label by name: removing it by position would silently drop
    # a real feature as soon as the column order changes.
    num_feat = [col for col in num_feat if col != 'target']
    cat_feat = summary[(summary.missed_percentage < 5) & (summary.type == 'category') & (summary.unique < 45)].index.to_list()

    # concat dataframes
    train_size = train_dataframe.shape[0]
    concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)

    # fill numeric missed values with the training mean of the feature
    print('Fill missed numbers...')
    num_fill = train_dataframe[num_feat].mean()
    concat_df[num_feat] = concat_df[num_feat].fillna(num_fill)

    # fill categorical missed values with the training mode of the feature
    print('Fill missed categories...')
    cat_fill = {col: train_dataframe[col].mode().iloc[0] for col in cat_feat}
    concat_df[cat_feat] = concat_df[cat_feat].fillna(cat_fill)

    # redefine dataframe with only necessary columns
    concat_df = concat_df[num_feat + cat_feat + ['target']]

    # convert CATEGORY data with encoder
    print('Encoding...')
    ohe = OneHotEncoder(dtype=np.int8, drop='first')
    ohe.fit(concat_df[cat_feat])
    temp_df = pd.DataFrame(data=ohe.transform(concat_df[cat_feat]).toarray(), columns=ohe.get_feature_names_out())
    concat_df = pd.concat([concat_df.reset_index(drop=True), temp_df], axis=1)

    # drop the original categorical columns, now replaced by indicator columns
    concat_df.drop(cat_feat, axis=1, inplace=True)
    print('Done!')
    return concat_df.iloc[:train_size], concat_df.iloc[train_size:]
    
# Replace both splits by their imputed / encoded versions, using the summary
# of the training split to pick the features.
train_df, test_df = fillna_n_encode(train_df, test_df, train_df_summary)
# The returned test part carries a `target` column; it is not a feature.
test_df = test_df.drop('target', axis=1)

Defining columns...


  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignore_index=True)
  concat_df = pd.concat([train_dataframe, test_dataframe], ignor

Fill missed numbers...
Fill missed categories...


  concat_df[cat_feat] = concat_df[cat_feat].apply(lambda x: x.fillna(x.mode().iloc[0]),axis=0)


Encoding...
Done!


In [28]:
# The summaries are not used after feature selection; release them.
del train_df_summary 
del test_df_summary

# Modeling

## Feature importance

In [31]:
# TBD

In [29]:
# Extra columns excluded from the model inputs in every cell below
# (currently none).
drop_cols = []
# Disabled: collect columns to drop by name prefix.
# drop_cols_startwith = ["std_"]
# for name_prefix in drop_cols_startwith:
#     cols_names = train_df.columns[train_df.columns.str.startswith(name_prefix)]
#     drop_cols += cols_names.to_list()
# display(drop_cols)

## Define custom metric

In [30]:
def gini_stability(base, score_col="score", w_fallingrate=88.0, w_resstd=-0.5):
    """Stability-adjusted Gini score over weeks.

    For every ``WEEK_NUM`` the Gini coefficient ``2 * AUC - 1`` of
    ``score_col`` against ``target`` is computed. A straight line is fitted
    through the weekly values (in week order); the result is

        mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)

    so only a negative slope is penalised.
    """
    frame = base.loc[:, ["WEEK_NUM", "target", score_col]].sort_values("WEEK_NUM")

    weekly_gini = [
        2 * roc_auc_score(week["target"], week[score_col]) - 1
        for _, week in frame.groupby("WEEK_NUM")
    ]

    steps = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(steps, weekly_gini, 1)
    trend = slope * steps + intercept
    res_std = np.std(weekly_gini - trend)
    avg_gini = np.mean(weekly_gini)

    return avg_gini + w_fallingrate * min(0, slope) + w_resstd * res_std

## Test default models

In [34]:
# The estimator imports in the first cell of the notebook are commented out,
# so every class used below is imported here; without this the cell fails
# with NameError on a fresh kernel.
import xgboost as xgb
from sklearn.ensemble import (
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, all with default settings apart from the iteration
# cap of the logistic regression.
models = [
    #SVC(probability=True),
    LogisticRegression(max_iter=10000),
    KNeighborsClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    HistGradientBoostingClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
    xgb.XGBClassifier(),
    lgb.LGBMClassifier()
]


In [35]:
# Compare the candidate models on a random 100,000-row subsample.
train_df_100000 = train_df.sample(n=100000)
X = train_df_100000.drop(columns=["target", "case_id", "WEEK_NUM"] + drop_cols)
print("X shape: ", X.shape)
y = train_df_100000["target"]
weeks = train_df_100000["WEEK_NUM"]

# model class name -> mean ROC AUC over the 5 folds
results = dict()

# look through models
for model in models:

    print("#-------------------------------------------------------------------#")
    print(type(model).__name__)

    # calculate score
    res = (cross_val_score(model, X, y, cv=5, scoring="roc_auc")).mean()
    print(f"ROC AUC score: {res}")

    # record data
    results[type(model).__name__] = res


X shape:  (100000, 354)
#-------------------------------------------------------------------#
LogisticRegression
ROC AUC score: 0.6098035001557571
#-------------------------------------------------------------------#
KNeighborsClassifier
ROC AUC score: 0.5135698395514187
#-------------------------------------------------------------------#
RandomForestClassifier
ROC AUC score: 0.6832854076839169
#-------------------------------------------------------------------#
GradientBoostingClassifier
ROC AUC score: 0.7445007835270747
#-------------------------------------------------------------------#
HistGradientBoostingClassifier
ROC AUC score: 0.7383956147099674
#-------------------------------------------------------------------#
GaussianNB
ROC AUC score: 0.6212964548050978
#-------------------------------------------------------------------#
DecisionTreeClassifier
ROC AUC score: 0.515162060777212
#-------------------------------------------------------------------#
XGBClassifier
ROC AUC sc

In [37]:
'''
 'LogisticRegression':             0.6098035001557571,
 'KNeighborsClassifier':           0.5135698395514187,
 'RandomForestClassifier':         0.6832854076839169,
 'GradientBoostingClassifier':     0.7445007835270747,
 'HistGradientBoostingClassifier': 0.7383956147099674,
 'GaussianNB':                     0.6212964548050978,
 'DecisionTreeClassifier':         0.515162060777212,
 'XGBClassifier':                  0.7161658831137498,
 'LGBMClassifier':                 0.7373580474525102}
'''

"\n 'LogisticRegression':             0.6098035001557571,\n 'KNeighborsClassifier':           0.5135698395514187,\n 'RandomForestClassifier':         0.6832854076839169,\n 'GradientBoostingClassifier':     0.7445007835270747,\n 'HistGradientBoostingClassifier': 0.7383956147099674,\n 'GaussianNB':                     0.6212964548050978,\n 'DecisionTreeClassifier':         0.515162060777212,\n 'XGBClassifier':                  0.7161658831137498,\n 'LGBMClassifier':                 0.7373580474525102}\n"

## LGBM 
(best: )

In [36]:
# Model inputs: every column except the label, the row id and the week number.
X = train_df.drop(columns=["target", "case_id", "WEEK_NUM"] + drop_cols)
print("X shape: ", X.shape)
y = train_df["target"]
# Used as the grouping key of the splitter below, so all rows of one week
# land in the same fold.
weeks = train_df["WEEK_NUM"]

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)
"""
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 8,
    "max_bin": 255,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8, 
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "device": "gpu",
}
"""
# params after tuning
params = {'num_leaves': 34, 'max_depth': 3, 'learning_rate': 0.03932342591510133, 'n_estimators': 293, 'max_bin': 32, 'boosting': 'dart', 'tree_learner': 'voting',
    "verbose": -1,
    "random_state": 42,
    "device": "gpu",
    "metric": "auc",}


# One fitted model per fold, plus the out-of-fold prediction of every row.
fitted_models = []
oof_pred = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = lgb.LGBMClassifier(**params)
    # NOTE(review): the early-stopping callback is combined with
    # 'boosting': 'dart'; LightGBM may not apply early stopping in dart
    # mode - confirm against the LightGBM documentation.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )
    fitted_models.append(model)
    # Probability of the positive class for the held-out rows of this fold.
    val_pred = model.predict_proba(X_valid)[:, 1]
    oof_pred[idx_valid] = val_pred
    gc.collect()
#'''

X shape:  (1526659, 305)




[100]	valid_0's auc: 0.689746
[200]	valid_0's auc: 0.694182


KeyboardInterrupt: 

In [None]:
# ROC AUC of the out-of-fold predictions over all training rows.
roc_auc_oof = roc_auc_score(y, oof_pred)
print("CV roc_auc_oof: ", roc_auc_oof)


In [None]:
# Stability metric of the out-of-fold predictions, evaluated per WEEK_NUM.
oof_df = train_df[["WEEK_NUM", "target"]].copy()
oof_df["pred_oof"] = oof_pred
gini_score = gini_stability(oof_df, score_col="pred_oof")
print("gini_score:\t", gini_score)


In [None]:
# (name, estimator) pairs for the fold models, named by fold index.
oof_models_dict = [(str(i), model) for i, model in enumerate(fitted_models)]

# Soft-voting ensemble of the already fitted fold models.
model = VotingClassifier(
    estimators=oof_models_dict,
    voting='soft',
)
# The ensemble is never fitted: the attributes that fitting would create are
# assigned by hand so the fold models are used as they are.
model.estimators_ = fitted_models
model.le_ = LabelEncoder().fit(y)
model.classes_ = model.le_.classes_


In [None]:
# Persist the voting ensemble.
joblib.dump(model, "oof_model_1.pkl")

In [None]:
# Persist the training column index together with the categorical and
# dropped column lists.
joblib.dump((train_df.columns, cat_cols, drop_cols), "train_cat_columns.pkl")

In [None]:
# Persist the out-of-fold predictions.
joblib.dump(oof_pred, "oof_pred.pkl")

## XGBoost (mostly shows worse results)
(best: )

In [None]:
# The top-level xgboost import of the notebook is commented out, so import
# the library here; without it this cell fails with NameError.
import xgboost as xgb

# Model inputs: every column except the label, the row id and the week number.
X = train_df.drop(columns=["target", "case_id", "WEEK_NUM"] + drop_cols)
print("X shape: ", X.shape)
y = train_df["target"]
# Grouping key of the splitter: all rows of one week stay in the same fold.
weeks = train_df["WEEK_NUM"]

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params = {
    "booster": "dart", #  gbtree
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 8,
    "max_bin": 255,
    "learning_rate": 0.05,
    #"n_estimators": 1000,
    #"verbose": -1,
    "random_state": 42,
    "device": "gpu",
}

# Upper bound on boosting rounds. Without it xgb.train runs its default of
# 10 rounds, far fewer than the 100-round early-stopping patience below,
# so early stopping could never take effect.
num_boost_round = 1000

# One fitted booster per fold, plus the out-of-fold prediction of every row.
fitted_models = []
oof_pred = np.zeros(X.shape[0])

for idx_train, idx_valid in cv.split(X, y, groups=weeks):

    D_train = xgb.DMatrix(X.iloc[idx_train], y.iloc[idx_train])
    D_valid = xgb.DMatrix(X.iloc[idx_valid], y.iloc[idx_valid])

    # A new callback per fold: the callback object carries its best score
    # and patience counter, so sharing one instance would let earlier folds
    # influence when later folds stop.
    early_stop = xgb.callback.EarlyStopping(rounds=100,
                                            metric_name='auc',
                                            data_name='Valid')

    model = xgb.train(
        {**params},
        D_train,
        num_boost_round=num_boost_round,
        evals=[(D_train, 'Train'), (D_valid, 'Valid')],
        verbose_eval=100,
        callbacks=[early_stop]
    )
    fitted_models.append(model)
    val_pred = model.predict(D_valid)
    oof_pred[idx_valid] = val_pred
    gc.collect()
    


In [None]:
# ROC AUC of the out-of-fold predictions over all training rows.
roc_auc_oof = roc_auc_score(y, oof_pred)
print("CV roc_auc_oof: ", roc_auc_oof)

In [None]:
# Stability metric of the out-of-fold predictions, evaluated per WEEK_NUM.
oof_df = train_df[["WEEK_NUM", "target"]].copy()
oof_df["pred_oof"] = oof_pred
gini_score = gini_stability(oof_df, score_col="pred_oof")
print("gini_score:\t", gini_score)

In [None]:
# (name, estimator) pairs for the fold models, named by fold index.
oof_models_dict = [(str(i), model) for i, model in enumerate(fitted_models)]

# NOTE(review): the fold models in this section come from xgb.train; soft
# voting calls predict_proba on each estimator, which these objects
# presumably do not provide - confirm before using this ensemble to predict.
model = VotingClassifier(
    estimators=oof_models_dict,
    voting='soft',
)
# The ensemble is never fitted: the fitted-state attributes are assigned by hand.
model.estimators_ = fitted_models
model.le_ = LabelEncoder().fit(y)
model.classes_ = model.le_.classes_

# Hyperparameter tuning

### Gradient Boosting
(best: )

In [None]:

# The top-level import of this estimator is commented out in the notebook,
# so import it here; without it every trial fails with NameError.
from sklearn.ensemble import GradientBoostingClassifier

# Tune on a random 10,000-row subsample to keep each trial cheap.
train_df_subsample = train_df.sample(n=10000)
X = train_df_subsample.drop(columns=["target", "case_id", "WEEK_NUM"] + drop_cols)
print("X shape: ", X.shape)
y = train_df_subsample["target"]
weeks = train_df_subsample["WEEK_NUM"]

def objective(trial):
    """Return the mean 3-fold ROC AUC of a GradientBoostingClassifier
    built from the hyperparameters suggested by ``trial``."""

    params = {
        'loss': trial.suggest_categorical('loss', ["log_loss", "exponential"]),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 50),
        'max_features': trial.suggest_categorical('max_features', ["sqrt", "log2", None]),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
        'criterion': trial.suggest_categorical('criterion', ["friedman_mse", "squared_error"]),
        }

    # Fit the model
    optuna_model = GradientBoostingClassifier(**params)
    scores = cross_val_score(optuna_model, X, y, cv=3, scoring="roc_auc")
    score = scores.mean()
    return score


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2024-02-17 11:29:32,051] A new study created in memory with name: no-name-82961765-5356-404c-b95d-6296b145f1cb


X shape:  (10000, 354)


[I 2024-02-17 11:29:34,896] Trial 0 finished with value: 0.650293508752236 and parameters: {'loss': 'exponential', 'n_estimators': 99, 'max_depth': 85, 'learning_rate': 0.5129796947676725, 'min_samples_leaf': 32, 'max_features': 'log2', 'min_samples_split': 14, 'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.650293508752236.
[I 2024-02-17 11:29:38,552] Trial 1 finished with value: 0.7082544529813631 and parameters: {'loss': 'log_loss', 'n_estimators': 365, 'max_depth': 4, 'learning_rate': 0.016142105051264524, 'min_samples_leaf': 25, 'max_features': 'log2', 'min_samples_split': 8, 'criterion': 'friedman_mse'}. Best is trial 1 with value: 0.7082544529813631.
[I 2024-02-17 11:32:47,067] Trial 2 finished with value: 0.6796333309142266 and parameters: {'loss': 'exponential', 'n_estimators': 198, 'max_depth': 33, 'learning_rate': 0.02643708435056029, 'min_samples_leaf': 12, 'max_features': None, 'min_samples_split': 2, 'criterion': 'friedman_mse'}. Best is trial 1 with value: 0.

### LGBM
(best: 0.6732)

In [None]:
# Tune on a random 10,000-row subsample to keep each trial cheap.
train_df_subsample = train_df.sample(n=10000)
X = train_df_subsample.drop(columns=["target", "case_id", "WEEK_NUM"] + drop_cols)
print("X shape: ", X.shape)
y = train_df_subsample["target"]
weeks = train_df_subsample["WEEK_NUM"]

def objective(trial):
    """Return the mean 3-fold ROC AUC of an LGBMClassifier built from the
    hyperparameters suggested by ``trial``."""

    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 2000),
        # LightGBM requires max_bin > 1, so the search starts at 2; a
        # suggested value of 1 would make the trial (and the study) fail.
        'max_bin': trial.suggest_int('max_bin', 2, 255),
        'boosting': trial.suggest_categorical('boosting', ['gbdt', 'dart']),
        'tree_learner': trial.suggest_categorical('tree_learner', ['serial', 'feature', 'data', 'voting']),

        # Fixed settings ("objective" used to be listed twice in this dict).
        "metric": 'auc',
        "verbosity": -1,
        "device": "gpu",
        "objective": "binary",
    }

    # Fit the model
    optuna_model = lgb.LGBMClassifier(**params)
    scores = cross_val_score(optuna_model, X, y, cv=3, scoring="roc_auc")
    score = scores.mean()
    return score


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2024-02-17 12:38:05,139] A new study created in memory with name: no-name-adecbc94-6b05-4e56-8659-469ee3255662


X shape:  (10000, 305)


# Results

In [None]:
# TBD

# Prediction

In [None]:
def predict_proba_in_batches(model, data, batch_size=100000):
    """Return the positive-class probability of every row of ``data``.

    ``model.predict_proba`` is called on consecutive slices of at most
    ``batch_size`` rows (taken with ``data.iloc``); column 1 of each result
    is written into a float array of length ``len(data)``. Progress is
    printed per batch and garbage is collected after each one.
    """
    total = len(data)
    n_chunks = int(np.ceil(total / batch_size))
    out = np.zeros((total,))

    for number, lo in enumerate(range(0, total, batch_size), start=1):
        print(f"Processing batch: {number}/{n_chunks}")
        hi = min(lo + batch_size, total)
        out[lo:hi] = model.predict_proba(data.iloc[lo:hi])[:, 1]
        gc.collect()

    return out

In [None]:
# Test inputs: drop the week number and index by case_id, so the predictions
# below are labelled by case.
X_test = test_df.drop(columns=["WEEK_NUM"] + drop_cols)
X_test = X_test.set_index("case_id")
print("X_test shape: ", X_test.shape)

# Positive-class probability per case, computed in batches.
y_pred = pd.Series(predict_proba_in_batches(model, X_test), index=X_test.index)
y_pred[:10]

# Submit

In [None]:
# Start from the sample submission, indexed by case_id.
subm_df = pd.read_csv(CFG.root_dir / "sample_submission.csv")
subm_df = subm_df.set_index("case_id")

# Assigning a Series aligns on the index, i.e. scores are matched by case_id.
subm_df["score"] = y_pred

In [None]:
# A null score would mean a case_id of the sample submission got no prediction.
print("Check null: ", subm_df["score"].isnull().any())

subm_df.head()

In [None]:
# Write the submission file; case_id is the index and is written as the first column.
subm_df.to_csv("submission.csv")