In [72]:
# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:28.899510Z","iopub.execute_input":"2024-05-24T08:32:28.900611Z","iopub.status.idle":"2024-05-24T08:32:34.168765Z","shell.execute_reply.started":"2024-05-24T08:32:28.900567Z","shell.execute_reply":"2024-05-24T08:32:34.167481Z"},"jupyter":{"outputs_hidden":false}}
import gc
import lightgbm as lgb  # type: ignore
import numpy as np  # type: ignore
import pandas as pd  # type: ignore
import polars as pl  # type: ignore
import warnings
import os
import shutil

from catboost import CatBoostClassifier, Pool  # type: ignore
from glob import glob
import joblib
from IPython.display import display  # type: ignore
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin  # type: ignore
from sklearn.metrics import roc_auc_score  # type: ignore
from sklearn.model_selection import StratifiedGroupKFold  # type: ignore
from typing import Any
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

warnings.filterwarnings("ignore")
ROOT = Path('/Users/mira/kaggle_HomeCredit')

#ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.171165Z","iopub.execute_input":"2024-05-24T08:32:34.171864Z","iopub.status.idle":"2024-05-24T08:32:34.338974Z","shell.execute_reply.started":"2024-05-24T08:32:34.171821Z","shell.execute_reply":"2024-05-24T08:32:34.337456Z"},"jupyter":{"outputs_hidden":false}}
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("str")

        return df, cat_cols

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.340577Z","iopub.execute_input":"2024-05-24T08:32:34.341140Z","iopub.status.idle":"2024-05-24T08:32:34.363081Z","shell.execute_reply.started":"2024-05-24T08:32:34.341079Z","shell.execute_reply":"2024-05-24T08:32:34.361494Z"},"jupyter":{"outputs_hidden":false}}
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "P")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "M")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "A")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "D")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "T")
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / "train_*.parquet", "L")
# feat_defs:pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.366890Z","iopub.execute_input":"2024-05-24T08:32:34.367697Z","iopub.status.idle":"2024-05-24T08:32:34.390519Z","shell.execute_reply.started":"2024-05-24T08:32:34.367637Z","shell.execute_reply":"2024-05-24T08:32:34.389144Z"},"jupyter":{"outputs_hidden":false}}
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"max_{col}") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"min_{col}") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"mean_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"var_{col}") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"mode_{col}") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.392482Z","iopub.execute_input":"2024-05-24T08:32:34.392917Z","iopub.status.idle":"2024-05-24T08:32:34.410491Z","shell.execute_reply.started":"2024-05-24T08:32:34.392884Z","shell.execute_reply":"2024-05-24T08:32:34.409025Z"},"jupyter":{"outputs_hidden":false}}
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None):
        chunks = []
        for path in glob(str(glob_path)):
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            if depth in [1, 2]:
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
            chunks.append(df)
        df = pl.concat(chunks, how="vertical_relaxed")
        del chunks
        gc.collect()
        df = df.unique(subset=["case_id"])
        return df

    @staticmethod
    def join_dataframes(df_base, depth_0, depth_1, depth_2):
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.412300Z","iopub.execute_input":"2024-05-24T08:32:34.412825Z","iopub.status.idle":"2024-05-24T08:32:34.435625Z","shell.execute_reply.started":"2024-05-24T08:32:34.412785Z","shell.execute_reply":"2024-05-24T08:32:34.434131Z"},"jupyter":{"outputs_hidden":false}}
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if col.endswith("D"):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.rename(
        {
            "MONTH": "month",
            "WEEK_NUM": "week_num"
        }
    )
            
    df = df.with_columns(
        [
            pl.col("date_decision").dt.year().alias("year").cast(pl.Int16),
            pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8),
        ]
    )

    return df.drop("date_decision")

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:32:34.437396Z","iopub.execute_input":"2024-05-24T08:32:34.437811Z","iopub.status.idle":"2024-05-24T08:35:54.241472Z","shell.execute_reply.started":"2024-05-24T08:32:34.437775Z","shell.execute_reply":"2024-05-24T08:35:54.240264Z"},"jupyter":{"outputs_hidden":false}}
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:35:54.242893Z","iopub.execute_input":"2024-05-24T08:35:54.243287Z","iopub.status.idle":"2024-05-24T08:35:56.559687Z","shell.execute_reply.started":"2024-05-24T08:35:54.243256Z","shell.execute_reply":"2024-05-24T08:35:56.558389Z"},"jupyter":{"outputs_hidden":false}}
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:35:56.561210Z","iopub.execute_input":"2024-05-24T08:35:56.561654Z","iopub.status.idle":"2024-05-24T08:36:22.870809Z","shell.execute_reply.started":"2024-05-24T08:35:56.561613Z","shell.execute_reply":"2024-05-24T08:36:22.869478Z"},"jupyter":{"outputs_hidden":false}}
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:36:22.875063Z","iopub.execute_input":"2024-05-24T08:36:22.875585Z","iopub.status.idle":"2024-05-24T08:36:22.886734Z","shell.execute_reply.started":"2024-05-24T08:36:22.875542Z","shell.execute_reply":"2024-05-24T08:36:22.885190Z"},"jupyter":{"outputs_hidden":false}}
class VotingModel(BaseEstimator, ClassifierMixin):
    """
    A voting ensemble model that combines predictions from multiple estimators.

    Parameters:
    - estimators (list): List of base estimators.

    Attributes:
    - estimators (list): List of base estimators.

    Methods:
    - fit(X, y=None): Fit the model to the training data.
    - predict(X): Predict class labels for samples.
    - predict_proba(X): Predict class probabilities for samples.
    """

    def __init__(self, estimators: list[BaseEstimator]):
        """
        Initialize the VotingModel with a list of base estimators.

        Args:
        - estimators (list): List of base estimators.
        """
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """
        Fit the model to the training data.

        Args:
        - X: Input features.
        - y: Target labels (ignored).

        Returns:
        - self: Returns the instance itself.
        """
        return self

    def predict(self, X):
        """
        Predict class labels for samples.

        Args:
        - X: Input features.

        Returns:
        - numpy.ndarray: Predicted class labels.
        """
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """
        Predict class probabilities for samples.

        Args:
        - X: Input features.

        Returns:
        - numpy.ndarray: Predicted class probabilities.
        """
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

Memory usage of dataframe "df_train" is 5696.4989 MB.
Memory usage of dataframe "df_train" became 3650.9096 MB.
Train data shape: (1526659, 472)


case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
35287,201910,42,1,,,,,-22332.0,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,0.0,"""a55475b1""","""a55475b1""",2.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,0.0,,,5406.600098,…,38000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,1811.0,1387.0,12.0,12.0,46036.398438,48303.097656,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",1460.916626,1053.888916,39519.25,46514.085938,129781.554688,85005.179688,205083184.0,86414392.0,2019,23
918222,201912,51,0,,,,,-9281.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,0.0,,,2368.0,…,38000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,3.0,,12.0,,918.600037,,2020.0,,"""a55475b1""","""ab3c25cf""",0.294118,,159.276001,,0.595588,,125868.195312,,2019,29
1596570,201910,42,0,,,,,-18559.0,0.0,1.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,0.0,0.0,42233.039062,2221.0,…,94000.0,,,,,,,,,"""a55475b1""","""a55475b1""",,0.0,"""c7a5ad39""","""a55475b1""",2.0,23.0,,0.0,,12.0,,0.0,,2020.0,"""ab3c25cf""","""a55475b1""",,0.0,,0.0,,0.0,,0.0,2019,25
1839585,202005,71,0,,,,,-22232.0,1.0,2.0,1.0,3.0,1.0,"""a55475b1""","""717ddd49""","""a55475b1""",0.0,2.0,"""3439d993""","""a55475b1""",3.0,,,,,,,,,,14.0,1.0,0.0,0.0,0.0,5703.0,…,62000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,100000.0,"""c7a5ad39""","""c7a5ad39""",6.0,35.0,0.0,899.0,12.0,12.0,0.0,54882.414062,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,56.556702,0.0,10648.56543,0.0,40234.082031,0.0,451848608.0,2020,13
936089,202001,53,0,,,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,,,"""DEDUCTION_6""",,14.0,,,,,,4607.800293,…,50000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020,13
106617,201901,3,0,,,-12196.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,9.0,9283.0,,14.0,,,,,0.0,,1218.0,…,36000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,22
813398,201909,38,0,,,,,-13540.0,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,5.0,"""a7fcb6e5""","""1a19667c""",2.0,,,,,4.0,1614.800049,"""DEDUCTION_6""",14.0,14.0,,2.0,0.0,,,3266.600098,…,80000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,73896.0,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,0.0,143.0,12.0,12.0,0.0,8731.600586,2020.0,2010.0,"""ab3c25cf""","""ab3c25cf""",0.0,5.266667,0.0,1153.253418,0.0,676.960938,0.0,5216001.5,2019,27
1788891,202002,58,0,,,,,-9665.0,0.0,2.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,2.0,"""a55475b1""","""a55475b1""",2.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,0.0,0.0,21369.201172,1830.200073,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",0.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,17
1789900,202002,59,0,,,,,-14903.0,1.0,1.0,1.0,10.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",17.0,32.0,"""a7fcb6e5""","""a55475b1""",10.0,,,,,,,"""DEDUCTION_6""",,14.0,,27.0,35.0,0.0,0.0,2514.400146,…,40000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,91000.0,"""c7a5ad39""","""c7a5ad39""",61.0,35.0,402.0,22.0,12.0,12.0,62640.0,7280.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",118.573334,0.978947,16664.984375,553.218445,16738.707031,12.327597,579648320.0,3051685.5,2020,18
179474,201911,46,0,,14.0,,,-25561.0,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",6.0,1.0,"""a55475b1""","""a55475b1""",2.0,,15552.200195,6.0,,,,"""PENSION_6""",,14.0,,1.0,1.0,0.0,316504.34375,8332.0,…,64000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",6.0,35.0,1.0,32.0,12.0,12.0,563.799988,6021.200195,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.045455,0.660714,25.627274,215.035721,0.045455,13.973938,14448.65625,1251033.5,2019,25


Memory usage of dataframe "df_test" is 0.0369 MB.
Memory usage of dataframe "df_test" became 0.0244 MB.
Test data shape: (10, 471)


In [73]:
# Set a seed for various non-deterministic processes for reproducibility
import random
def seed_it_all(seed=7):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

SEED = 0

# set the seed for this run
seed_it_all(SEED)

In [74]:
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [75]:
sample = pd.read_csv(ROOT / "sample_submission.csv")
device='gpu'
est_cnt=6000
DRY_RUN = True if sample.shape[0] == 10 else False   
if DRY_RUN:
    device='cpu'
    df_train = df_train.iloc[:5000]
    est_cnt=600

print(device)

# %% [code] {"execution":{"iopub.status.busy":"2024-05-24T08:36:22.920891Z","iopub.execute_input":"2024-05-24T08:36:22.921381Z","iopub.status.idle":"2024-05-24T08:36:24.421036Z","shell.execute_reply.started":"2024-05-24T08:36:22.921340Z","shell.execute_reply":"2024-05-24T08:36:24.419720Z"},"jupyter":{"outputs_hidden":false}}
X = df_train.drop(columns=["target", "case_id", "week_num"])
y = df_train["target"]

weeks = df_train["week_num"]

del df_train
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)

cpu


### Feature Selection

In [76]:
params_lgb = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 250,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": SEED,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device, 
}

In [77]:
params_lgb2 = {
    "boosting_type": "goss",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 250,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": SEED,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device, 
}

In [78]:
fitted_models_cb = []
fitted_models_lgb = []
fitted_models_lgb2 = []
fitted_models_eclf = []
cv_scores_cb = []
cv_scores_lgb = []
cv_scores_lgb2 = []
cv_scores_eclf = []

meta_features = np.zeros((len(X), 4))  # 3モデル分の予測を格納
meta_target = np.zeros(len(X))

for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train] 
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf_cb = CatBoostClassifier(
        best_model_min_trees = 1000,
        boosting_type = "Plain",
        eval_metric = "AUC",
        iterations = est_cnt,
        learning_rate = 0.05,
        l2_leaf_reg = 10,
        max_leaves = 64,
        random_seed = SEED,
        task_type = "GPU",
        use_best_model = True
    )
    clf_cb.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cb.append(clf_cb)
    y_pred_valid = clf_cb.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cb.append(auc_score)
  
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    clf_lgb = LGBMClassifier(**params_lgb)
    clf_lgb.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(60)] )
    
    fitted_models_lgb.append(clf_lgb)
    y_pred_valid_lgb = clf_lgb.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid_lgb)
    cv_scores_lgb.append(auc_score)
    
    clf_lgb2 = LGBMClassifier(**params_lgb2)
    clf_lgb2.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(60)] )
    
    fitted_models_lgb2.append(clf_lgb2)
    y_pred_valid_lgb2 = clf_lgb2.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid_lgb2)
    cv_scores_lgb2.append(auc_score)
 
    eclf = VotingClassifier(
     estimators=[('lgb', clf_lgb), ('lgb2', clf_lgb2)],
     voting='soft', weights=[1, 1])   
    eclf = eclf.fit(X_train, y_train)
    fitted_models_eclf.append(eclf)
    y_pred_valid_eclf = eclf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid_eclf)
    cv_scores_eclf.append(auc_score)
    
    meta_features[idx_valid, 0] = y_pred_valid_lgb
    meta_features[idx_valid, 1] = y_pred_valid_lgb2
    meta_features[idx_valid, 2] = y_pred_valid_eclf
    meta_target[idx_valid] = y_valid

Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.758149
Early stopping, best iteration is:
[151]	valid_0's auc: 0.769162
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[139]	valid_0's auc: 0.764897
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.742781
Early stopping, best iteration is:
[177]	valid_0's auc: 0.74493
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[83]	valid_0's auc: 0.750814
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[68]	valid_0's auc: 0.749111
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.74381
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[100]	valid_0's auc: 0.775159
Training until validation scores don't improve for 60 rounds
Ea

In [79]:
print("CatBoost")   
#print("CV AUC scores: ", cv_scores_cb)
#print("Maximum CV AUC score: ", max(cv_scores_cb))
print("LightGBM")
print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))
print("LightGBM_goss")
print("CV AUC scores: ", cv_scores_lgb2)
print("Maximum CV AUC score: ", max(cv_scores_lgb2))
print("Ensemble of LGBM and LGBM_goss")
print("CV AUC scores: ", cv_scores_eclf)
print("Maximum CV AUC score: ", max(cv_scores_eclf))

CatBoost
LightGBM
CV AUC scores:  [0.7691622103386809, 0.7449303179504297, 0.7491113922525453, 0.7751586893120603, 0.8289500599047369]
Maximum CV AUC score:  0.8289500599047369
LightGBM_goss
CV AUC scores:  [0.7648968678380443, 0.7508136526746224, 0.7438098680643412, 0.8029850746268656, 0.8289500599047369]
Maximum CV AUC score:  0.8289500599047369
Ensemble of LGBM and LGBM_goss
CV AUC scores:  [0.764514896867838, 0.744700826170408, 0.7446834146635339, 0.7836335563561503, 0.7761608369130066]
Maximum CV AUC score:  0.7836335563561503


In [80]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split

meta_params_lgb = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "n_estimators": 250,  
    "verbose": -1,
    "random_state": SEED,
    "extra_trees":True,
    "device": device, 
}
meta_models_lgb = []
for idx_train, idx_valid in cv.split(meta_features, meta_target, groups=weeks):
    X_train, y_train = meta_features[idx_train], meta_target[idx_train] 
    X_valid, y_valid = meta_features[idx_valid], meta_target[idx_valid]
    
    clf_lgb = LGBMClassifier(**meta_params_lgb)
    clf_lgb.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(60)] )
    
    meta_models_lgb.append(clf_lgb)
    y_pred_valid_lgb = clf_lgb.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid_lgb)
    cv_scores_lgb.append(auc_score)


Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[68]	valid_0's auc: 0.774287
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[78]	valid_0's auc: 0.757125
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[8]	valid_0's auc: 0.762953
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[58]	valid_0's auc: 0.808303
Training until validation scores don't improve for 60 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.824523


In [81]:
from sklearn.base import BaseEstimator, RegressorMixin
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        X[cat_cols] = X[cat_cols].astype("category")
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

model = VotingModel(meta_models_lgb)

# Submission

In [82]:
import pandas as pd
import numpy as np

# テストデータの前処理
df_test = df_test.drop(columns=["week_num"])
df_test[cat_cols] = df_test[cat_cols].astype("category")
df_test = df_test.set_index("case_id")

# メタフィーチャーの初期化
meta_features_test = np.zeros((len(df_test), 3))

# 各ベースモデルで予測を実行してメタフィーチャーを生成
for model in fitted_models_lgb:
    preds_lgb = model.predict_proba(df_test)[:, 1]
    meta_features_test[:, 0] += preds_lgb / len(fitted_models_lgb)

for model in fitted_models_lgb2:
    preds_lgb2 = model.predict_proba(df_test)[:, 1]
    meta_features_test[:, 1] += preds_lgb2 / len(fitted_models_lgb2)

for model in fitted_models_eclf:
    preds_eclf = model.predict_proba(df_test)[:, 1]
    meta_features_test[:, 2] += preds_eclf / len(fitted_models_eclf)

# メタモデルを使用して最終予測を実行
final_predictions = np.zeros(len(df_test))
for model in meta_models_lgb:
    final_preds = model.predict_proba(meta_features_test)[:, 1]
    final_predictions += final_preds / len(meta_models_lgb)

# 提出用データフレームの準備
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")
df_subm["score"] = pd.Series(final_predictions, index=df_test.index)

# 提出ファイルの保存
df_subm.to_csv("submission.csv")

# 提出データの表示
df_subm


Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.023743
57549,0.067651
57551,0.021129
57552,0.059764
57569,0.100121
57630,0.066347
57631,0.025007
57632,0.026529
57633,0.023449
57634,0.027237
