In [1]:
# 標準ライブラリ
import gc
import os
import pickle
import random
import sys
import warnings
from itertools import combinations, permutations
from pathlib import Path
import pytz


# サードパーティのライブラリ
import category_encoders as ce
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from dateutil.relativedelta import relativedelta
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score, log_loss, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import (GroupKFold, KFold, StratifiedKFold,
                                     StratifiedGroupKFold, TimeSeriesSplit,
                                     train_test_split)
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from tqdm.auto import tqdm
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt



warnings.filterwarnings('ignore')


from catboost import CatBoostClassifier, Pool  # type: ignore
from glob import glob
from IPython.display import display  # type: ignore
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin  # type: ignore
from sklearn.metrics import roc_auc_score  # type: ignore
from sklearn.model_selection import StratifiedGroupKFold  # type: ignore
from typing import Any

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CFG:
    home_directory = os.path.expanduser('~/kaggle_HomeCredit/')
    kaggle_directory = os.path.expanduser('/kaggle/input/home-credit-credit-risk-model-stability/')
    
    train_data_path = os.path.join(home_directory, 'train/')
    test_data_path = os.path.join(home_directory, 'test/')
    
    OOF_DATA_PATH = Path(home_directory) / 'oof'
    MODEL_DATA_PATH = Path(home_directory) / 'models'
    SUB_DATA_PATH = Path(home_directory) / 'submission'

    def __init__(self):
        self.create_directories()
    
    def create_directories(self):
        for path in [self.OOF_DATA_PATH, self.MODEL_DATA_PATH, self.SUB_DATA_PATH]:
            path.mkdir(parents=True, exist_ok=True)
    
    
    VER = 20_2
    AUTHOR = 'Mira'
    COMPETITION = 'HomeCredit'

    METHOD_LIST = ['lightgbm','catboost']
    #METHOD_LIST = ['lightgbm']
    seed = 28
    n_folds = 5
    target_col = 'target'
    metric = 'auc'
    
    metric_maximize_flag = True
    num_boost_round = 500
    early_stopping_round = 200
    verbose = 25
    classification_lgb_params = {
        "boosting_type": "gbdt",
        "objective": "binary",
        "metric": "auc",
        "max_depth": 10,  
        "learning_rate": 0.05,
        "n_estimators": num_boost_round,  
        "colsample_bytree": 0.8,
        "colsample_bynode": 0.8,
        "verbose": -1,
        "reg_alpha": 0.1,
        "reg_lambda": 10,
        "extra_trees":True,
        'num_leaves':64,
        'seed': seed,
    }
    classification_xgb_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.05,
        'random_state': seed,
        "tree_method": "gpu_hist",
    }

    classification_cat_params = {
        'iterations':num_boost_round,                
        'learning_rate':0.05,
        'depth':10,                     
        'l2_leaf_reg':10,                  
        'loss_function':'Logloss',        
        'eval_metric':'AUC',              
        'bootstrap_type':'Bernoulli',    
        'subsample':0.8,                  
        'colsample_bylevel':0.8,         
        'verbose':False,                  
        'leaf_estimation_iterations':10,       
        'random_seed':seed,
        #"task_type": "GPU",
    }
    model_weight_dict = {'lightgbm': 0.5,'catboost':0.5}
    #model_weight_dict = {'lightgbm': 1}

class is_kaggle:
    def __init__(self, Kaggle):
        if Kaggle == "Yes":
            self.path = Path(CFG.kaggle_directory)
            CFG.MODEL_DATA_PATH = Path('/kaggle/input/05061800/models')
        else:
            self.path = Path(CFG.home_directory)
            CFG.MODEL_DATA_PATH = Path(CFG.home_directory) / 'models'

def create_timestamped_file():
    tz_tokyo = pytz.timezone('Asia/Tokyo')
    now = datetime.datetime.now(tz=tz_tokyo)
    filename = now.strftime('%m%d-%H%M') + '.txt'
    full_path = CFG.MODEL_DATA_PATH / filename
    full_path.touch()

#create_timestamped_file()
cfg_instance = CFG()      
selector = is_kaggle("No")

ROOT = selector.path
TRAIN_DIR       = ROOT / "parquet_files/train"
TEST_DIR        = ROOT / "parquet_files/test"
SAMPLE_SUB = ROOT / "sample_submission.csv"

In [3]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("str")

        return df, cat_cols

In [4]:
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "P")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "M")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "A")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "D")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "T")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "L")
# feat_defs:pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

In [5]:
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"max_{col}") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"min_{col}") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"mean_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"var_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"mode_{col}") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

In [6]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None):
        chunks = []
        for path in glob(str(glob_path)):
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            if depth in [1, 2]:
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
            chunks.append(df)
        df = pl.concat(chunks, how="vertical_relaxed")
        del chunks
        gc.collect()
        df = df.unique(subset=["case_id"])
        return df

    @staticmethod
    def join_dataframes(df_base, depth_0, depth_1, depth_2):
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base


In [7]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if col.endswith("D"):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.rename(
        {
            "MONTH": "month",
            "WEEK_NUM": "week_num"
        }
    )
            
    df = df.with_columns(
        [
            pl.col("date_decision").dt.year().alias("year").cast(pl.Int16),
            pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8),
        ]
    )

    return df.drop("date_decision")

In [8]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

Memory usage of dataframe "df_train" is 4711.2195 MB.
Memory usage of dataframe "df_train" became 2665.6302 MB.
Train data shape: (1526659, 472)


case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
728959,201906,25,0,,,-10560.0,,-10560,2.0,2.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",4.0,0.0,"""a55475b1""","""a55475b1""",2.0,,,,,0.0,0.0,,14.0,,,2.0,0.0,,,1310.599976,…,44000.0,,,,,,,,,"""a55475b1""","""a55475b1""",6986000.0,0.0,"""c7a5ad39""","""c7a5ad39""",1,35,60.0,9.0,12.0,12.0,19977.322266,5040.226074,2020.0,2017.0,"""ab3c25cf""","""be4fd70b""",8.157895,0.5,4473.829102,280.012543,165.379807,4.5,27690198.0,1411300.0,2019,30
2664210,202002,57,0,,14.0,,,-24475,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,6989.399902,6.0,,,,"""PENSION_6""",,14.0,,0.0,0.0,0.0,19525.599609,4093.400146,…,32000.0,,,,,,,,,"""a55475b1""","""a55475b1""",,0.0,"""c7a5ad39""","""a55475b1""",0,11,,19.0,,12.0,,11.2,,2020.0,"""ab3c25cf""","""a55475b1""",,2.714286,,1.6,,51.57143,,17.92,2020,5
692776,201905,19,0,,,-9149.0,,-9149,3.0,3.0,1.0,5.0,3.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,1.0,"""a55475b1""","""a55475b1""",5.0,,,,,7.0,65331.082031,,14.0,,,4.0,1.0,,,2852.199951,…,100000.0,,,,,,,,,"""a55475b1""","""a55475b1""",10704186.0,,"""a55475b1""","""c7a5ad39""",2,23,3.0,,12.0,,9045.389648,,2020.0,,"""a55475b1""","""ab3c25cf""",0.363636,,963.231262,,0.623377,,6797566.0,,2019,19
30324,201909,36,0,14.0,14.0,,,-23266,1.0,1.0,1.0,1.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",1.0,5687.800293,5687.800293,6.0,6.0,,,"""PENSION_6""",14.0,14.0,,0.0,1.0,,,2279.600098,…,10000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0,23,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,13
243971,202008,84,0,,,,236740.9375,-9112,1.0,2.0,0.0,7.0,1.0,"""2fc785b2""","""a55475b1""","""a55475b1""",1.0,4.0,"""a7fcb6e5""","""1a19667c""",7.0,,,,,,,"""DEDUCTION_6""",,14.0,14.0,0.0,3.0,0.0,18397.599609,2513.0,…,46000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2,23,0.0,373.0,12.0,12.0,0.0,12856.200195,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,107.035713,0.0,3715.160156,0.0,17151.740234,0.0,14827657.0,2020,12
1549124,201909,37,0,,,,,-15879,0.0,1.0,0.0,3.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,3.0,"""a55475b1""","""a55475b1""",3.0,,,,,6.0,11248.600586,"""DEDUCTION_6""",14.0,14.0,,1.0,0.0,0.0,0.0,2511.800049,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",1,35,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,22
1884828,202007,82,0,,,,1845500.0,-23189,0.0,0.0,0.0,2.0,0.0,"""2fc785b2""","""717ddd49""","""a55475b1""",4.0,2.0,"""3439d993""","""a55475b1""",2.0,,,,,,,,,,14.0,4.0,2.0,0.0,24871.427734,1768.800049,…,32000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",5,35,0.0,17.0,12.0,12.0,0.0,4887.382324,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.264706,0.0,115.518478,0.0,3.602504,0.0,499370.21875,2020,29
1387502,201906,22,0,,,-13548.0,,-13548,0.0,1.0,0.0,11.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",14.0,9.0,"""3439d993""","""a55475b1""",11.0,,,,,13.0,22149.345703,,14.0,,,4.0,10.0,0.0,20120.630859,4399.399902,…,70000.0,0.0,0.0,-1196.0,0.0,-1196.0,0.0,-1196.0,-1196.0,"""a55475b1""","""a55475b1""",8444000.0,214120.0,"""c7a5ad39""","""c7a5ad39""",12,35,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""daf49a8a""","""daf49a8a""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,4
2675357,202003,62,0,,13.0,,,-24269,1.0,1.0,0.0,5.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",4.0,0.0,"""3439d993""","""a55475b1""",5.0,,7503.399902,13.0,,,,"""PENSION_6""",,13.0,,4.0,2.0,0.0,99679.320312,4186.800293,…,48000.0,220.586014,1.0,-1059.0,214.089005,-1133.0,1.0,-1059.0,-1133.0,"""a55475b1""","""a55475b1""",0.0,148010.0,"""c7a5ad39""","""c7a5ad39""",21,35,0.0,19.0,12.0,12.0,0.0,79.821999,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0837,0.0,0.351639,0.0,1.590308,0.0,28.06851,2020,12
1602593,201910,43,0,,,,,-18777,4.0,6.0,4.0,6.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,4.0,"""a55475b1""","""a55475b1""",6.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,2.0,0.0,0.0,1015.799988,…,20000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",0,35,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2016.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,29


In [9]:
import glob as glob_module
import polars as pl

def readFiles(path_pattern: str) -> pl.DataFrame:
    """
    Reads multiple Parquet files matching the given path pattern and combines them into a single DataFrame.

    Args:
    - path_pattern (str): The file path pattern to match (e.g., "/path/to/files/*.parquet").

    Returns:
    - pl.DataFrame: Combined DataFrame from all matching files.
    """
    # List of all matching file paths
    file_paths = glob_module.glob(path_pattern)
    
    # Initialize an empty list to hold DataFrames
    df_list = []
    
    # Iterate over each file path and read the Parquet file
    for file_path in file_paths:
        df = pl.read_parquet(file_path)
        df_list.append(df)
    
    # Standardize column data types across all DataFrames
    standardized_df_list = []
    for df in df_list:
        # Ensure all columns have consistent types
        for col in df.columns:
            if df[col].dtype == pl.Null:
                df = df.with_columns(pl.lit(None).cast(pl.Utf8).alias(col))  # Convert Null columns to String type
            elif df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64]:
                df = df.with_columns(df[col].cast(pl.Float64))  # Convert integer columns to Float64
            elif df[col].dtype == pl.Boolean:
                df = df.with_columns(df[col].cast(pl.Utf8))  # Convert boolean columns to String type
            elif df[col].dtype != pl.Utf8 and df[col].dtype != pl.Float64:
                df = df.with_columns(df[col].cast(pl.Utf8))  # Convert other types to String type for consistency
        standardized_df_list.append(df)

    # Concatenate all standardized DataFrames into a single DataFrame
    combined_df = pl.concat(standardized_df_list)
    
    return combined_df
# 使用例

testing = pd.read_parquet("/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train/train_base.parquet").set_index("case_id")
#testing = testing.drop(['date_decision', 'WEEK_NUM', 'MONTH'], axis=1)

creditA = readFiles("/Users/i.itsuki/kaggle_HomeCredit/parquet_files/train/train_credit_bureau_a_1_*.parquet")

"""
maxDates = (creditA.lazy()
    .select([pl.col("case_id"), pl.col("refreshdate_3813885D").str.to_datetime()])
    .group_by("case_id")
    .max()
    ).collect().to_pandas().set_index("case_id").refreshdate_3813885D.sort_index()

predictDates = maxDates - pd.to_timedelta(str(14) + " days")
testing = pd.concat([predictDates, testing], axis=1).sort_index()

dateMin = testing.refreshdate_3813885D.min()
dateMax = testing.refreshdate_3813885D.max()
firstMonday = testing.refreshdate_3813885D[testing.refreshdate_3813885D.dt.day_of_week==1].min()

dayBetween = (dateMax - firstMonday).days
dayRange = [firstMonday + pd.to_timedelta(str(i) + " days") for i in range(0, dayBetween, 7)]
testing["WEEK_NUM"] = 0

for i in range(-1, len(dayRange)):
    if i < 0 and testing.refreshdate_3813885D.min() != firstMonday:
        testing["WEEK_NUM"] += ~testing.refreshdate_3813885D.isna()
        continue


    testing["WEEK_NUM"] += testing.refreshdate_3813885D >= dayRange[i]

# クレジットデータの読み込みと処理
maxDates = (creditA.lazy()
    .select([pl.col("case_id"), pl.col("refreshdate_3813885D").str.to_datetime()])
    .group_by("case_id")
    .max()
    ).collect().to_pandas().set_index("case_id").refreshdate_3813885D.sort_index()

predictDates = maxDates - pd.to_timedelta(str(14) + " days")
testing = pd.concat([predictDates, testing], axis=1).sort_index()

dateMin = testing.refreshdate_3813885D.min()
dateMax = testing.refreshdate_3813885D.max()
firstMonday = testing.refreshdate_3813885D[testing.refreshdate_3813885D.dt.day_of_week==1].min()



# 最初の月曜日から最大日付までの範囲で、週ごとの日付リストを作成
dayBetween = (dateMax - firstMonday).days
dayRange = [firstMonday + pd.to_timedelta(str(i) + " days") for i in range(0, dayBetween, 7)]

# WEEK_NUM列の初期化と計算
testing["WEEK_NUM"] = 0
for i in range(-1, len(dayRange)):
    if i < 0 and testing.refreshdate_3813885D.min() != firstMonday:
        testing["WEEK_NUM"] += ~testing.refreshdate_3813885D.isna()
        continue
    testing["WEEK_NUM"] += testing.refreshdate_3813885D >= dayRange[i]

# case_idがインデックスとして設定されている場合、リセットする
if 'case_id' not in testing.columns:
    testing = testing.reset_index()

if 'case_id' not in df_train.columns:
    df_train = df_train.reset_index()

df_train = df_train.to_pandas()

# データフレームをcase_idでマージ
merged_df = testing[['case_id', 'WEEK_NUM']].merge(df_train[['case_id', 'week_num']], on='case_id')

# WEEK_NUM列の比較
merged_df['WEEK_NUM_match'] = merged_df['WEEK_NUM'] == merged_df['week_num']

# 結果の表示
display(merged_df)

# 比較結果の集計や可視化
num_matches = merged_df['WEEK_NUM_match'].sum()
total_cases = len(merged_df)
match_percentage = num_matches / total_cases * 100

print(f'Matching WEEK_NUM cases: {num_matches} / {total_cases} ({match_percentage:.2f}%)')
"""

'\nmaxDates = (creditA.lazy()\n    .select([pl.col("case_id"), pl.col("refreshdate_3813885D").str.to_datetime()])\n    .group_by("case_id")\n    .max()\n    ).collect().to_pandas().set_index("case_id").refreshdate_3813885D.sort_index()\n\npredictDates = maxDates - pd.to_timedelta(str(14) + " days")\ntesting = pd.concat([predictDates, testing], axis=1).sort_index()\n\ndateMin = testing.refreshdate_3813885D.min()\ndateMax = testing.refreshdate_3813885D.max()\nfirstMonday = testing.refreshdate_3813885D[testing.refreshdate_3813885D.dt.day_of_week==1].min()\n\ndayBetween = (dateMax - firstMonday).days\ndayRange = [firstMonday + pd.to_timedelta(str(i) + " days") for i in range(0, dayBetween, 7)]\ntesting["WEEK_NUM"] = 0\n\nfor i in range(-1, len(dayRange)):\n    if i < 0 and testing.refreshdate_3813885D.min() != firstMonday:\n        testing["WEEK_NUM"] += ~testing.refreshdate_3813885D.isna()\n        continue\n\n\n    testing["WEEK_NUM"] += testing.refreshdate_3813885D >= dayRange[i]\n\n# ク

In [12]:
# クレジットデータの読み込みと処理
maxDates = (creditA.lazy()
    .select([pl.col("case_id"), pl.col("refreshdate_3813885D").str.to_datetime()])
    .group_by("case_id")
    .max()
    ).collect().to_pandas().set_index("case_id").refreshdate_3813885D.sort_index()

# 予測日の計算
predictDates = maxDates - pd.to_timedelta(str(14) + " days")
predictDates = predictDates.rename("refreshdate_3813885D")  # 列名を変更

# `testing`データフレームに予測日付を追加
testing['refreshdate_3813885D'] = predictDates
testing = testing.reset_index().sort_index()

# refreshdate_3813885D列が正しく設定されているか確認
display(testing.columns)

# `refreshdate_3813885D`列の日付から曜日を取得
testing['day_of_week'] = testing['refreshdate_3813885D'].dt.dayofweek

# 最初の月曜日を基準にして`week_num`を計算
# 月曜日を0、日曜日を6とする
first_monday = testing[testing['day_of_week'] == 0]['refreshdate_3813885D'].min()
testing['WEEK_NUM'] = ((testing['refreshdate_3813885D'] - first_monday).dt.days // 7) + 1

# df_trainをPandas DataFrameに変換（必要な場合）
#df_train = df_train.to_pandas()

# case_idがインデックスとして設定されている場合、リセットする
if 'case_id' not in df_train.columns:
    df_train = df_train.reset_index()

# データフレームをcase_idでマージ
merged_df = testing[['case_id', 'WEEK_NUM']].merge(df_train[['case_id', 'week_num']], on='case_id')

# WEEK_NUM列の比較
merged_df['WEEK_NUM_match'] = merged_df['WEEK_NUM'] == merged_df['week_num']


# 結果の表示
display(testing[['WEEK_NUM','refreshdate_3813885D']])

# 比較結果の集計や可視化
num_matches = merged_df['WEEK_NUM_match'].sum()
total_cases = len(merged_df)
match_percentage = num_matches / total_cases * 100

print(f'Matching WEEK_NUM cases: {num_matches} / {total_cases} ({match_percentage:.2f}%)')

Index(['level_0', 'index', 'case_id', 'date_decision', 'MONTH', 'WEEK_NUM',
       'target', 'refreshdate_3813885D', 'day_of_week'],
      dtype='object')

Unnamed: 0,WEEK_NUM,refreshdate_3813885D
0,,NaT
1,,NaT
2,,NaT
3,,NaT
4,,NaT
...,...,...
1526654,36.0,2019-09-08
1526655,36.0,2019-09-08
1526656,36.0,2019-09-08
1526657,36.0,2019-09-08


Matching WEEK_NUM cases: 3741 / 1526659 (0.25%)


In [None]:
merged_df[['week_num','refreshdate_3813885D']]

In [None]:
testing['refreshdate_3813885D'].isnull().sum()

In [None]:
merged_df['WEEK_NUM_match'] = merged_df['WEEK_NUM'] == merged_df['week_num']
merged_df

In [None]:
non_matching_cases = merged_df[merged_df['WEEK_NUM_match'] == False]
print("\nNon-matching cases:")
print(non_matching_cases.head(10))

In [None]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

In [None]:
df_test['week_num']

In [None]:
df_train['week_num'].max()

In [None]:
testing.info()

In [None]:
df_train[['case_id','week_num']]

In [None]:
merged_df = testing[['case_id', 'WEEK_NUM']].merge(df_train[['case_id', 'week_num']], on='case_id', suffixes=('_testing', '_train'))
display(merged_df.head())
merged_df['WEEK_NUM_match'] = merged_df['WEEK_NUM_testing'] == merged_df['WEEK_NUM_train']


In [None]:
num_matches = merged_df['WEEK_NUM_match'].sum()
total_cases = len(merged_df)
match_percentage = num_matches / total_cases * 100

print(f'Matching WEEK_NUM cases: {num_matches} / {total_cases} ({match_percentage:.2f}%)')


In [None]:
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

In [None]:
class VotingModel(BaseEstimator, ClassifierMixin):
    """
    A voting ensemble model that combines predictions from multiple estimators.

    Parameters:
    - estimators (list): List of base estimators.

    Attributes:
    - estimators (list): List of base estimators.

    Methods:
    - fit(X, y=None): Fit the model to the training data.
    - predict(X): Predict class labels for samples.
    - predict_proba(X): Predict class probabilities for samples.
    """

    def __init__(self, estimators: list[BaseEstimator]):
        """
        Initialize the VotingModel with a list of base estimators.

        Args:
        - estimators (list): List of base estimators.
        """
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """
        Fit the model to the training data.

        Args:
        - X: Input features.
        - y: Target labels (ignored).

        Returns:
        - self: Returns the instance itself.
        """
        return self

    def predict(self, X):
        """
        Predict class labels for samples.

        Args:
        - X: Input features.

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """
        Predict class probabilities for samples.

        Args:
        - X: Input features.

        Returns:
        - numpy.ndarray: Predicted class probabilities.
        """
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [None]:
df_subm: pd.DataFrame = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

device: str = "gpu"
est_cnt: int = 6000

DRY_RUN = True if df_subm.shape[0] == 10 else False
if DRY_RUN:
    device = "cpu"
    df_train = df_train.iloc[:50000]
    est_cnt: int = 600

print(device)

In [None]:
X = df_train.drop(columns=["target", "case_id", "week_num"])
y = df_train["target"]

weeks = df_train["week_num"]

#del df_train
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params1 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.05,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

params2 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.03,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 16,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 54,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

iter_cnt = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    '''
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(
        best_model_min_trees = 1000,
        boosting_type = "Plain",
        eval_metric = "AUC",
        iterations = est_cnt,
        learning_rate = 0.05,
        l2_leaf_reg = 10,
        max_leaves = 64,
        random_seed = 42,
        task_type = "GPU",
        use_best_model = True
    )

    clf.fit(train_pool, eval_set=val_pool, verbose=False)
    fitted_models_cat.append(clf)

    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)
    '''
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    if iter_cnt % 2 == 0:
        model = lgb.LGBMClassifier(**params1)
    else:
        model = lgb.LGBMClassifier(**params2)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)],
    )
    fitted_models_lgb.append(model)

    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)

    iter_cnt += 1

model = VotingModel(fitted_models_cat + fitted_models_lgb)

#print(f"\nCV AUC scores for CatBoost: {cv_scores_cat}")
#print(f"Maximum CV AUC score for Catboost: {max(cv_scores_cat)}", end="\n\n")


print(f"CV AUC scores for LGBM: {cv_scores_lgb}")
print(f"Maximum CV AUC score for LGBM: {max(cv_scores_lgb)}", end="\n\n")

del X, y
gc.collect()

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.829186
[200]	valid_0's auc: 0.830062
Early stopping, best iteration is:
[143]	valid_0's auc: 0.832625
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.817886
[200]	valid_0's auc: 0.824315
[300]	valid_0's auc: 0.825302
Early stopping, best iteration is:
[277]	valid_0's auc: 0.82588
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.832977
[200]	valid_0's auc: 0.836605
[300]	valid_0's auc: 0.838039
Early stopping, best iteration is:
[288]	valid_0's auc: 0.838424
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.798351
[200]	valid_0's auc: 0.812143
[300]	valid_0's auc: 0.812793
Early stopping, best iteration is:
[251]	valid_0's auc: 0.81398
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.806045
...
[152]	valid_0's auc: 0.808167
CV AUC scores for LGBM: [0.832625157557228, 0.8258797409515442, 0.8384238678191509, 0.8139802713768391, 0.8081673991999244]
Maximum CV AUC score for LGBM: 0.8384238678191509

In [None]:
X_test: pd.DataFrame = df_test.drop(columns=["week_num"]).set_index("case_id")

X_test[cat_cols] = X_test[cat_cols].astype("category")

y_pred: pd.Series = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

df_subm["score"] = y_pred

display(df_subm)

df_subm.to_csv("submission.csv")

del X_test, y_pred, df_subm
gc.collect()