In [None]:
# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:28.899510Z","iopub.execute_input":"2024-05-24T08:32:28.900611Z","iopub.status.idle":"2024-05-24T08:32:34.168765Z","shell.execute_reply.started":"2024-05-24T08:32:28.900567Z","shell.execute_reply":"2024-05-24T08:32:34.167481Z"},"jupyter":{"outputs_hidden":false}}
import gc
import lightgbm as lgb  # type: ignore
import numpy as np  # type: ignore
import pandas as pd  # type: ignore
import polars as pl  # type: ignore
import warnings
import os
import shutil

from catboost import CatBoostClassifier, Pool  # type: ignore
from glob import glob
import joblib
from IPython.display import display  # type: ignore
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin  # type: ignore
from sklearn.metrics import roc_auc_score  # type: ignore
from sklearn.model_selection import StratifiedGroupKFold  # type: ignore
from typing import Any
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

warnings.filterwarnings("ignore")

ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.171165Z","iopub.execute_input":"2024-05-24T08:32:34.171864Z","iopub.status.idle":"2024-05-24T08:32:34.338974Z","shell.execute_reply.started":"2024-05-24T08:32:34.171821Z","shell.execute_reply":"2024-05-24T08:32:34.337456Z"},"jupyter":{"outputs_hidden":false}}
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("str")

        return df, cat_cols

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.340577Z","iopub.execute_input":"2024-05-24T08:32:34.341140Z","iopub.status.idle":"2024-05-24T08:32:34.363081Z","shell.execute_reply.started":"2024-05-24T08:32:34.341079Z","shell.execute_reply":"2024-05-24T08:32:34.361494Z"},"jupyter":{"outputs_hidden":false}}
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "P")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "M")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "A")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "D")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "T")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "L")
# feat_defs:pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.366890Z","iopub.execute_input":"2024-05-24T08:32:34.367697Z","iopub.status.idle":"2024-05-24T08:32:34.390519Z","shell.execute_reply.started":"2024-05-24T08:32:34.367637Z","shell.execute_reply":"2024-05-24T08:32:34.389144Z"},"jupyter":{"outputs_hidden":false}}
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"max_{col}") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"min_{col}") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"mean_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"var_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"mode_{col}") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.392482Z","iopub.execute_input":"2024-05-24T08:32:34.392917Z","iopub.status.idle":"2024-05-24T08:32:34.410491Z","shell.execute_reply.started":"2024-05-24T08:32:34.392884Z","shell.execute_reply":"2024-05-24T08:32:34.409025Z"},"jupyter":{"outputs_hidden":false}}
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None):
        chunks = []
        for path in glob(str(glob_path)):
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            if depth in [1, 2]:
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
            chunks.append(df)
        df = pl.concat(chunks, how="vertical_relaxed")
        del chunks
        gc.collect()
        df = df.unique(subset=["case_id"])
        return df

    @staticmethod
    def join_dataframes(df_base, depth_0, depth_1, depth_2):
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.412300Z","iopub.execute_input":"2024-05-24T08:32:34.412825Z","iopub.status.idle":"2024-05-24T08:32:34.435625Z","shell.execute_reply.started":"2024-05-24T08:32:34.412785Z","shell.execute_reply":"2024-05-24T08:32:34.434131Z"},"jupyter":{"outputs_hidden":false}}
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if col.endswith("D"):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.rename(
        {
            "MONTH": "month",
            "WEEK_NUM": "week_num"
        }
    )
            
    df = df.with_columns(
        [
            pl.col("date_decision").dt.year().alias("year").cast(pl.Int16),
            pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8),
        ]
    )

    return df.drop("date_decision")

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.437396Z","iopub.execute_input":"2024-05-24T08:32:34.437811Z","iopub.status.idle":"2024-05-24T08:35:54.241472Z","shell.execute_reply.started":"2024-05-24T08:32:34.437775Z","shell.execute_reply":"2024-05-24T08:35:54.240264Z"},"jupyter":{"outputs_hidden":false}}
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:35:54.242893Z","iopub.execute_input":"2024-05-24T08:35:54.243287Z","iopub.status.idle":"2024-05-24T08:35:56.559687Z","shell.execute_reply.started":"2024-05-24T08:35:54.243256Z","shell.execute_reply":"2024-05-24T08:35:56.558389Z"},"jupyter":{"outputs_hidden":false}}
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:35:56.561210Z","iopub.execute_input":"2024-05-24T08:35:56.561654Z","iopub.status.idle":"2024-05-24T08:36:22.870809Z","shell.execute_reply.started":"2024-05-24T08:35:56.561613Z","shell.execute_reply":"2024-05-24T08:36:22.869478Z"},"jupyter":{"outputs_hidden":false}}
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:36:22.875063Z","iopub.execute_input":"2024-05-24T08:36:22.875585Z","iopub.status.idle":"2024-05-24T08:36:22.886734Z","shell.execute_reply.started":"2024-05-24T08:36:22.875542Z","shell.execute_reply":"2024-05-24T08:36:22.885190Z"},"jupyter":{"outputs_hidden":false}}
class VotingModel(BaseEstimator, ClassifierMixin):
    """
    A voting ensemble model that combines predictions from multiple estimators.

    Parameters:
    - estimators (list): List of base estimators.

    Attributes:
    - estimators (list): List of base estimators.

    Methods:
    - fit(X, y=None): Fit the model to the training data.
    - predict(X): Predict class labels for samples.
    - predict_proba(X): Predict class probabilities for samples.
    """

    def __init__(self, estimators: list[BaseEstimator]):
        """
        Initialize the VotingModel with a list of base estimators.

        Args:
        - estimators (list): List of base estimators.
        """
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """
        Fit the model to the training data.

        Args:
        - X: Input features.
        - y: Target labels (ignored).

        Returns:
        - self: Returns the instance itself.
        """
        return self

    def predict(self, X):
        """
        Predict class labels for samples.

        Args:
        - X: Input features.

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """
        Predict class probabilities for samples.

        Args:
        - X: Input features.

        Returns:
        - numpy.ndarray: Predicted class probabilities.
        """
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)



In [None]:
# Set a seed for various non-deterministic processes for reproducibility
import random
def seed_it_all(seed=7):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

SEED = 0

# set the seed for this run
seed_it_all(SEED)

In [None]:
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [None]:
sample = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv")
device='gpu'
est_cnt=6000
DRY_RUN = True if sample.shape[0] == 10 else False   
if DRY_RUN:
    device='cpu'
    df_train = df_train.iloc[:5000]
    est_cnt=600

print(device)

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:36:22.920891Z","iopub.execute_input":"2024-05-24T08:36:22.921381Z","iopub.status.idle":"2024-05-24T08:36:24.421036Z","shell.execute_reply.started":"2024-05-24T08:36:22.921340Z","shell.execute_reply":"2024-05-24T08:36:24.419720Z"},"jupyter":{"outputs_hidden":false}}
X = df_train.drop(columns=["target", "case_id", "week_num"])
y = df_train["target"]

weeks = df_train["week_num"]

del df_train
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

### Feature Selection

In [None]:
params_lgb = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2500,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": SEED,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device, 
}

In [None]:
params_lgb2 = {
    "boosting_type": "goss",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2500,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": SEED,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device, 
}

In [7]:
fitted_models_cb = []
fitted_models_lgb = []
fitted_models_lgb2 = []
fitted_models_eclf = []
cv_scores_cb = []
cv_scores_lgb = []
cv_scores_lgb2 = []
cv_scores_eclf = []

for idx_train, idx_valid in cv.split(X, y, groups=weeks):#
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]# 
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
    clf_cb = CatBoostClassifier(
        best_model_min_trees = 1000,
        boosting_type = "Plain",
        eval_metric = "AUC",
        iterations = est_cnt,
        learning_rate = 0.05,
        l2_leaf_reg = 10,
        max_leaves = 64,
        random_seed = SEED,
        task_type = "GPU",
        use_best_model = True
    )
    random_seed=SEED
    clf_cb.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cb.append(clf_cb)
    y_pred_valid = clf_cb.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cb.append(auc_score)
    
    
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    clf_lgb = LGBMClassifier(**params_lgb)
    clf_lgb.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(60)] )
    
    fitted_models_lgb.append(clf_lgb)
    y_pred_valid = clf_lgb.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    
    clf_lgb2 = LGBMClassifier(**params_lgb2)
    clf_lgb2.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(60)] )
    
    fitted_models_lgb2.append(clf_lgb2)
    y_pred_valid = clf_lgb2.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb2.append(auc_score)
 
    eclf = VotingClassifier(
     estimators=[('lgb', clf_lgb), ('lgb2', clf_lgb2)],
     voting='soft', weights=[1, 1])   
    eclf = eclf.fit(X_train, y_train)
    fitted_models_eclf.append(eclf)
    y_pred_valid = eclf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_eclf.append(auc_score)

300:	test: 0.7264932	best: 0.7460781 (75)	total: 27.8s	remaining: 27.6s
599:	test: 0.7044074	best: 0.7460781 (75)	total: 47.3s	remaining: 0us
bestTest = 0.7460781336
bestIteration = 75
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.718568
Early stopping, best iteration is:
[211]	valid_0's auc: 0.720322
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.726071
Early stopping, best iteration is:
[153]	valid_0's auc: 0.731008


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5700632	best: 0.5700632 (0)	total: 72.7ms	remaining: 43.5s
300:	test: 0.7906042	best: 0.8068830 (45)	total: 19.8s	remaining: 19.6s
599:	test: 0.7835118	best: 0.8068830 (45)	total: 39.4s	remaining: 0us
bestTest = 0.8068830371
bestIteration = 45
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[117]	valid_0's auc: 0.787733
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[126]	valid_0's auc: 0.77848


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5700818	best: 0.5700818 (0)	total: 69.6ms	remaining: 41.7s
300:	test: 0.8288592	best: 0.8349068 (150)	total: 19.7s	remaining: 19.5s
599:	test: 0.8227294	best: 0.8349068 (150)	total: 39.6s	remaining: 0us
bestTest = 0.8349068165
bestIteration = 150
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.843828
Early stopping, best iteration is:
[156]	valid_0's auc: 0.850204
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.849027
Early stopping, best iteration is:
[232]	valid_0's auc: 0.851818


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.4743321	best: 0.4743321 (0)	total: 70ms	remaining: 41.9s
300:	test: 0.7414767	best: 0.7427342 (235)	total: 19.7s	remaining: 19.6s
599:	test: 0.7608149	best: 0.7609546 (590)	total: 39.5s	remaining: 0us
bestTest = 0.7609546185
bestIteration = 590
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[93]	valid_0's auc: 0.789152
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[94]	valid_0's auc: 0.77884


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.4868561	best: 0.4868561 (0)	total: 64.7ms	remaining: 38.7s
300:	test: 0.7861982	best: 0.7936170 (70)	total: 19.7s	remaining: 19.6s
599:	test: 0.8031915	best: 0.8031915 (599)	total: 39.6s	remaining: 0us
bestTest = 0.803191483
bestIteration = 599
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.824636
Early stopping, best iteration is:
[251]	valid_0's auc: 0.828639
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.823488
Early stopping, best iteration is:
[327]	valid_0's auc: 0.830767


In [8]:
print("CatBoost")   
print("CV AUC scores: ", cv_scores_cb)
print("Maximum CV AUC score: ", max(cv_scores_cb))
print("LightGBM")
print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))
print("LightGBM_goss")
print("CV AUC scores: ", cv_scores_lgb2)
print("Maximum CV AUC score: ", max(cv_scores_lgb2))
print("Ensemble of LGBM and LGBM_goss")
print("CV AUC scores: ", cv_scores_eclf)
print("Maximum CV AUC score: ", max(cv_scores_eclf))

CatBoost
CV AUC scores:  [0.7044074182337848, 0.7835117700699112, 0.8227293872971567, 0.7608148893360162, 0.803191489361702]
Maximum CV AUC score:  0.8227293872971567
LightGBM
CV AUC scores:  [0.7203221929910033, 0.7877334594211219, 0.8502038694141149, 0.789151576123407, 0.8286394176931691]
Maximum CV AUC score:  0.8502038694141149
LightGBM_goss
CV AUC scores:  [0.73100782747085, 0.778479516363268, 0.8518184057138165, 0.7788397048960429, 0.8307670772676372]
Maximum CV AUC score:  0.8518184057138165
Ensemble of LGBM and LGBM_goss
CV AUC scores:  [0.7045048556302576, 0.7527778715930966, 0.8206222806009359, 0.7828638497652582, 0.8146976483762598]
Maximum CV AUC score:  0.8206222806009359


In [9]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# 交差検証の各分割での検証データのインデックスを用いてメタ特徴量と対応するラベルを集める関数
def collect_predictions(fitted_models, X, cv, y,cat_cols):
    meta_features = []
    meta_labels = []
    X[cat_cols] = X[cat_cols].astype("category")
    for _, idx_valid in cv.split(X, y, groups=weeks):
        X_valid = X.iloc[idx_valid]
        X_valid[cat_cols] = X_valid[cat_cols].astype("category")
        preds = [model.predict_proba(X_valid)[:, 1] for model in fitted_models]
        meta_features.append(preds)
        meta_labels.append(y.iloc[idx_valid])
    return np.concatenate(meta_features, axis=1), np.concatenate(meta_labels)

# 各モデルからメタ特徴量を生成
meta_features_cb, meta_labels = collect_predictions(fitted_models_cb, X, cv, y,cat_cols)
meta_features_lgb, _ = collect_predictions(fitted_models_lgb, X, cv, y,cat_cols)
meta_features_lgb2, _ = collect_predictions(fitted_models_lgb2, X, cv, y,cat_cols)
meta_features_eclf, _ = collect_predictions(fitted_models_eclf, X, cv, y,cat_cols)

# 全てのメタ特徴量を結合
meta_features = np.hstack([meta_features_cb, meta_features_lgb, meta_features_lgb2, meta_features_eclf])
display(meta_features)
display(meta_labels)

# メタ特徴量のみを使用してメタモデルを訓練
X_meta_train, X_meta_valid, y_meta_train, y_meta_valid = train_test_split(
    meta_features, meta_labels, test_size=0.2, random_state=SEED)

# メタモデル（LightGBM）の訓練
meta_clf = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    random_state=SEED
)
meta_clf.fit(X_meta_train, y_meta_train)

# メタモデルの評価
y_meta_pred = meta_clf.predict_proba(X_meta_valid)[:, 1]
meta_auc_score = roc_auc_score(y_meta_valid, y_meta_pred)
print(f'Meta model AUC: {meta_auc_score:.4f}')


array([[3.35632618e-02, 1.13002394e-02, 9.88102256e-02, ...,
        6.91147278e-05, 5.05896140e-05, 4.13471753e-05],
       [2.14529209e-02, 9.59534202e-03, 6.22929155e-02, ...,
        1.30947781e-04, 5.75250667e-04, 1.03609841e-04],
       [3.82794773e-02, 1.42616365e-02, 5.54670385e-02, ...,
        8.79105134e-05, 1.68466978e-04, 5.26650457e-05],
       [3.03608829e-02, 9.74713739e-03, 4.26370019e-02, ...,
        1.22819122e-04, 4.02924259e-05, 2.49400077e-04],
       [2.14414812e-02, 1.08690634e-02, 5.38447734e-02, ...,
        8.20588244e-05, 2.10653742e-04, 3.54340330e-05]])

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

ValueError: Found input variables with inconsistent numbers of samples: [5, 5000]

In [None]:
from sklearn.base import BaseEstimator, RegressorMixin
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]
        
        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:]]
        
        return np.mean(y_preds, axis=0)

#model = VotingModel(fitted_models_cb+fitted_models_eclf)

# Submission

In [None]:
## import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# テストデータの前処理
df_test = df_test.drop(columns=["week_num"])
df_test = df_test.set_index("case_id")

# テストデータに対するメタ特徴量を生成する関数
def generate_test_meta_features(fitted_models, X):
    meta_features = []
    for model in fitted_models:
        if hasattr(model, 'predict_proba'):
            y_pred = model.predict_proba(X)[:, 1]
        else:
            y_pred = model.predict(X)
        meta_features.append(y_pred)
    return np.column_stack(meta_features)

# テストデータでの予測
test_meta_features_cb = generate_test_meta_features(fitted_models_cb, df_test)
test_meta_features_lgb = generate_test_meta_features(fitted_models_lgb, df_test)
test_meta_features_lgb2 = generate_test_meta_features(fitted_models_lgb2, df_test)
test_meta_features_eclf = generate_test_meta_features(fitted_models_eclf, df_test)

# 全てのテストメタ特徴量を結合
test_meta_features = np.hstack([test_meta_features_cb, test_meta_features_lgb, test_meta_features_lfgb2, test_meta_features_eclf])

# メタモデルを用いてテストデータのスコアを予測
y_pred_test = meta_clf.predict_proba(test_meta_features)[:, 1]

# サブミッションファイルの準備
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")
df_subm["score"] = y_pred_test

# サブミッションファイルの出力
df_subm.to_csv("submission.csv")

# サブミッションファイルを表示
df_subm
