In [78]:
import os
import gc
from glob import glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import polars as pl

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.base import BaseEstimator, ClassifierMixin

import lightgbm as lgb

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [79]:
class VotingModel(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [80]:
# pre processing
def drop_high_null_columns(df, threshold=0.5):
    null_percentages = df.select([(pl.col(col).is_null().mean()).alias(col) for col in df.columns])
    cols_to_drop = [
        col
        for col in null_percentages.columns
        if null_percentages.select(pl.col(col)).item() > threshold
    ]
    df = df.drop(cols_to_drop)
    return df

# def impute_missing_values(df):
#     # Iterate over each column
#     for col in df.columns:
#         # Check if the column has null values
#         if df[col].null_count() > 0:
#             # If the column is numerical, fill null values with the mean
#             if df[col].dtype == pl.Float64:
#                 mean_value = df[col].mean()
#                 df[col] = df[col].fill_null(mean_value)
#             # If the column is categorical, fill null values with the mode
#             elif df[col].dtype == pl.Utf8:
#                 mode_value = df[col].mode()[0]
#                 df[col] = df[col].fill_null(mode_value)
#     return df

# def impute_missing_values(df):
#     # Iterate over each column
#     for col in df.columns:
#         # Check if the column has null values
#         if df[col].null_count() > 0:
#             # If the column is numerical, fill null values with the mean
#             if df[col].dtype == pl.Float64:
#                 mean_value = df[col].mean()
#                 df = df.with_column(col, df[col].fill_null(mean_value))
#             # If the column is categorical, fill null values with the mode
#             elif df[col].dtype == pl.Utf8:
#                 mode_value = df[col].mode()[0]
#                 df = df.with_column(col, df[col].fill_null(mode_value))
#     return df

# def drop_low_high_cardinality_columns(df, low_threshold=1, high_threshold=1000):
#     unique_counts = df.select([pl.col(col).n_unique().alias(col) for col in df.columns if df[col].dtype == pl.Utf8])
#     cols_to_drop = []
#     for col in unique_counts.columns:
#         unique_count = unique_counts.select(pl.col(col)).item()
#         if unique_count == low_threshold or unique_count > high_threshold:
#             cols_to_drop.append(col)
    
#     df = df.drop(cols_to_drop)
#     return df

# def impute_missing_values(df):
#     # Iterate over each column
#     for col in df.columns:
#         # Check if the column has null values
#         if df[col].null_count() > 0:
#             # If the column is numerical, fill null values with the mean
#             if df[col].dtype == pl.Float64:
#                 mean_value = df[col].mean()
#                 df = df.with_columns([(col, df[col].fill_null(mean_value))])
#             # If the column is categorical, fill null values with the mode
#             elif df[col].dtype == pl.Utf8:
#                 mode_value = df[col].mode()[0]
#                 df = df.with_columns([(col, df[col].fill_null(mode_value))])
#     return df




In [81]:
class Pipeline:
    @staticmethod
    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))            

        return df
    
    @staticmethod
    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())
                df = df.with_columns(pl.col(col).cast(pl.Float32))
                
        df = df.drop("date_decision", "MONTH")

        return df
    
    @staticmethod
    def filter_cols(df):
        # List of attributes to exclude

        # manual select the columns to remove
        excluded_attributes = [
            "addres_district_368M", "addres_role_871L", "addres_zip_823M",
            "amtinstpaidbefduel24m_4187115A", "annualeffectiverate_199L", "annuity_853A",
            "applicationcnt_361L", "applications30d_658L", "applicationscnt_1086L",
            "applicationscnt_464L", "applicationscnt_629L", "applicationscnt_867L",
            "approvaldate_319D", "assignmentdate_238D", "assignmentdate_4527235D",
            "assignmentdate_4955616D", "bankacctype_710L", "birth_259D", "birthdate_574D",
            "birthdate_87D", "byoccupationinc_3656910L", "cardtype_51L", "childnum_21L",
            "classificationofcontr_1114M", "classificationofcontr_13M", "classificationofcontr_400M",
            "clientscnt_100L", "clientscnt_1022L", "clientscnt_1071L", "clientscnt_1130L",
            "clientscnt_136L", "clientscnt_157L", "clientscnt_257L", "clientscnt_304L",
            "clientscnt_360L", "clientscnt_493L", "clientscnt_533L", "clientscnt_887L",
            "clientscnt_946L", "clientscnt12m_3712952L", "clientscnt3m_3712950L",
            "clientscnt6m_3712949L", "cntpmts24_3658933L", "collater_typofvalofguarant_298M",
            "collater_typofvalofguarant_407M", "collaterals_typeofguarante_359M",
            "collaterals_typeofguarante_669M", "contaddr_district_15M", "contaddr_matchlist_1032L",
            "contaddr_smempladdr_334L", "contaddr_zipcode_807M", "contractdate_551D",
            "contractenddate_991D", "contractmaturitydate_151D", "contractst_964M",
            "contractsum_5085717L", "contracttype_653M", "conts_role_79M", "conts_type_509L",
            "creationdate_885D", "credlmt_228A", "credlmt_230A", "credor_3940957M", "credtype_322L",
            "credtype_587L", "dateactivated_425D", "datefirstoffer_1144D", "datelastinstal40dpd_247D",
            "datelastunpaid_3546854D", "dateofcredend_289D", "dateofcredend_353D", "dateofcredstart_181D",
            "dateofcredstart_739D", "dateofrealrepmt_138D", "deductiondate_4917603D", "description_351M",
            "description_5085714M", "disbursementtype_67L", "district_544M", "dpdmaxdatemonth_804T",
            "dpdmaxdatemonth_89T", "dpdmaxdateyear_596T", "dpdmaxdateyear_742T", "dpdmaxdateyear_896T",
            "dtlastpmt_581D", "dtlastpmtallstes_3545839D", "dtlastpmtallstes_4499206D", "education_1103M",
            "education_1138M", "education_88M", "education_927M", "eir_270L", "empl_employedfrom_271D",
            "empladdr_district_926M", "empladdr_zipcode_114M", "employedfrom_700D", "employername_160M",
            "empls_employedfrom_796D", "empls_employer_name_740M", "familystate_447L", "familystate_726L",
            "financialinstitution_382M", "financialinstitution_591M", "firstclxcampaign_1125D",
            "firstdatedue_489D", "firstnonzeroinstldate_307D", "fourthquarter_440L", "gender_992L",
            "housetype_905L", "housingtype_772L", "incometype_1044T", "inittransactioncode_186L",
            "inittransactioncode_279L", "isbidproduct_1095L", "isbidproduct_390L", "isbidproductrequest_292L",
            "isdebitcard_527L", "isdebitcard_729L", "language1_981M", "last180dayturnover_1134A",
            "last30dayturnover_651A", "lastactivateddate_801D", "lastapplicationdate_877D",
            "lastapprcommoditycat_1041M", "lastapprcommoditytypec_5251766M", "lastapprdate_640D",
            "lastcancelreason_561M", "lastdelinqdate_224D", "lastrejectcommoditycat_161M",
            "lastrejectcommodtypec_5251769M", "lastrejectdate_50D", "lastrejectreason_759M",
            "lastrejectreasonclient_4145040M", "lastrepayingdate_696D", "lastupdate_1112D",
            "lastupdate_260D", "lastupdate_388D", "maritalst_385M", "maritalst_703L", "maritalst_893M",
            "maxdpdinstldate_3546855D", "mobilephncnt_593L", "name_4527232M", "name_4917606M",
            "numberofoverdueinstlmaxdat_148D", "numberofoverdueinstlmaxdat_641D", "openingdate_313D",
            "openingdate_857D", "overdueamountmax2date_1002D", "overdueamountmax2date_1142D",
            "overdueamountmaxdatemonth_284T", "overdueamountmaxdatemonth_365T", "overdueamountmaxdatemonth_494T",
            "overdueamountmaxdateyear_2T", "overdueamountmaxdateyear_432T", "overdueamountmaxdateyear_994T",
            "paytype_783L", "paytype1st_925L", "payvacationpostpone_4187118D", "periodicityofpmts_1102L",
            "periodicityofpmts_837L", "personindex_1023L", "persontype_1072L", "persontype_792L",
            "posfpd10lastmonth_333P", "posfpd30lastmonth_3976960P", "posfstqpd30lastmonth_3976962P",
            "postype_4733339M", "previouscontdistrict_112M", "processingdate_168D", "purposeofcred_426M",
            "purposeofcred_722M", "purposeofcred_874M", "registaddr_district_1083M", "registaddr_zipcode_184M",
            "rejectreason_755M", "rejectreasonclient_4145042M", "relatedpersons_role_762T",
            "relationshiptoclient_415T", "relationshiptoclient_642T", "requesttype_4525192L",
            "responsedate_1012D", "responsedate_4527233D", "responsedate_4917613D", "role_1084L",
            "role_993L", "secondquarter_766L", "sellerplacecnt_915L", "sellerplacescnt_216L",
            "sex_738L", "subjectrole_182M", "subjectrole_326M", "subjectrole_43M", "subjectrole_93M",
            "thirdquarter_1082L", "twobodfilling_608L", "type_25L", "typesuite_864L", "validfrom_1069D"
        ]


        # Exclude the attributes
        df = df.drop(excluded_attributes, errors='ignore')
    
           # Print the number of columns left
        print("Number of columns left after excluding:", len(df.columns))

       # Preprocess the dataframe
        df = drop_high_null_columns(df)
        # Print the number of columns left
        print("Number of columns left after droping the null:", len(df.columns))
        #df = drop_low_high_cardinality_columns(df)

        # print the number of columns left
        #print("Number of columns left after droping the low and high cardinality columns:", len(df.columns))

        # Print the columns that have null values
        null_columns = [col for col in df.columns if df[col].null_count() > 0]
        print("Columns with null values:", null_columns)

                # Print the number of columns left
        print("Number of columns left after dropping the low and high cardinality columns:", len(df.columns))

        # Print the null values for each column
        for col in df.columns:
            null_count = df[col].null_count()
            print(f"Null values in '{col}': {null_count}")

        # df = impute_missing_values(df)

        if df[col].dtype == 'object' or df[col].dtype == 'category':
            mode = df[col].mode().iloc[0]
            df[col] = df[col].fillna(mode,inplace=True)
        else:
            mean = df[col].mean()
            df[col] = df[col].fillna(mean,inplace=True)

         # Print the null values for each column
        for col in df.columns:
            null_count = df[col].null_count()
            print(f"Null values in '{col}': {null_count}")
        
        return df

In [82]:
class Aggregator:
    @staticmethod
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_max

    @staticmethod
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D",)]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_max

    @staticmethod
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_max

    @staticmethod
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_max
    
    @staticmethod
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        return expr_max

    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [83]:
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    
    if depth in [1, 2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
    
    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        
        chunks.append(df)
        
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])

    return df
    

In [84]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
        
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        
    df_base = df_base.pipe(Pipeline.handle_dates)
    
    return df_base

In [85]:
def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    
    return df_data, cat_cols

In [86]:

##ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")
##TRAIN_DIR       = ROOT / "parquet_files" / "train"
##TEST_DIR        = ROOT / "parquet_files" / "test"

from pathlib import Path
TRAIN_DIR = Path("../parquet_files/train")
TEST_DIR = Path("../parquet_files/test")


In [87]:
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
    ]
}

In [88]:
df_train = feature_eng(**data_store)

print("train data shape:\t", df_train.shape)

train data shape:	 (1526659, 472)


In [89]:
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
    ]
}

In [90]:
df_test = feature_eng(**data_store)

print("test data shape:\t", df_test.shape)

test data shape:	 (10, 471)


In [91]:
# Convert train data to pandas and filter
df_train, cat_cols = to_pandas(df_train)
df_train_filtered = df_train.pipe(Pipeline.filter_cols)

# Convert test data to pandas and filter
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test_filtered = df_test.pipe(Pipeline.filter_cols)

# Ensure that both train and test data have the same columns except for the target attribute
common_columns = list(set(df_train_filtered.columns).intersection(set(df_test_filtered.columns)))

# Add specific columns to the common columns list if they are not already present
columns_to_keep = ['case_id', 'WEEK_NUM', 'month_decision', 'weekday_decision']
for col in columns_to_keep:
    if col not in common_columns:
        common_columns.append(col)

# Preserve the original order of columns in df_train_filtered
common_columns = [col for col in df_train_filtered.columns if col in common_columns]

# Add 'target' column to train data
df_train_filtered = df_train_filtered[common_columns + ['target']]

# Filter test data to include only common columns
df_test_filtered = df_test_filtered[common_columns]




KeyError: "['addres_district_368M', 'addres_role_871L', 'addres_zip_823M', 'amtinstpaidbefduel24m_4187115A', 'annualeffectiverate_199L', 'annuity_853A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'approvaldate_319D', 'assignmentdate_238D', 'assignmentdate_4527235D', 'assignmentdate_4955616D', 'bankacctype_710L', 'birth_259D', 'birthdate_574D', 'birthdate_87D', 'byoccupationinc_3656910L', 'cardtype_51L', 'childnum_21L', 'classificationofcontr_1114M', 'classificationofcontr_13M', 'classificationofcontr_400M', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'cntpmts24_3658933L', 'collater_typofvalofguarant_298M', 'collater_typofvalofguarant_407M', 'collaterals_typeofguarante_359M', 'collaterals_typeofguarante_669M', 'contaddr_district_15M', 'contaddr_matchlist_1032L', 'contaddr_smempladdr_334L', 'contaddr_zipcode_807M', 'contractdate_551D', 'contractenddate_991D', 'contractmaturitydate_151D', 'contractst_964M', 'contractsum_5085717L', 'contracttype_653M', 'conts_role_79M', 'conts_type_509L', 'creationdate_885D', 'credlmt_228A', 'credlmt_230A', 'credor_3940957M', 'credtype_322L', 'credtype_587L', 'dateactivated_425D', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'dateofcredend_289D', 'dateofcredend_353D', 'dateofcredstart_181D', 'dateofcredstart_739D', 'dateofrealrepmt_138D', 'deductiondate_4917603D', 'description_351M', 'description_5085714M', 'disbursementtype_67L', 'district_544M', 'dpdmaxdatemonth_804T', 'dpdmaxdatemonth_89T', 'dpdmaxdateyear_596T', 'dpdmaxdateyear_742T', 'dpdmaxdateyear_896T', 'dtlastpmt_581D', 'dtlastpmtallstes_3545839D', 'dtlastpmtallstes_4499206D', 'education_1103M', 'education_1138M', 'education_88M', 'education_927M', 'eir_270L', 'empl_employedfrom_271D', 'empladdr_district_926M', 'empladdr_zipcode_114M', 'employedfrom_700D', 'employername_160M', 'empls_employedfrom_796D', 'empls_employer_name_740M', 'familystate_447L', 'familystate_726L', 'financialinstitution_382M', 'financialinstitution_591M', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'firstnonzeroinstldate_307D', 'fourthquarter_440L', 'gender_992L', 'housetype_905L', 'housingtype_772L', 'incometype_1044T', 'inittransactioncode_186L', 'inittransactioncode_279L', 'isbidproduct_1095L', 'isbidproduct_390L', 'isbidproductrequest_292L', 'isdebitcard_527L', 'isdebitcard_729L', 'language1_981M', 'last180dayturnover_1134A', 'last30dayturnover_651A', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprdate_640D', 'lastcancelreason_561M', 'lastdelinqdate_224D', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectdate_50D', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'lastrepayingdate_696D', 'lastupdate_1112D', 'lastupdate_260D', 'lastupdate_388D', 'maritalst_385M', 'maritalst_703L', 'maritalst_893M', 'maxdpdinstldate_3546855D', 'mobilephncnt_593L', 'name_4527232M', 'name_4917606M', 'numberofoverdueinstlmaxdat_148D', 'numberofoverdueinstlmaxdat_641D', 'openingdate_313D', 'openingdate_857D', 'overdueamountmax2date_1002D', 'overdueamountmax2date_1142D', 'overdueamountmaxdatemonth_284T', 'overdueamountmaxdatemonth_365T', 'overdueamountmaxdatemonth_494T', 'overdueamountmaxdateyear_2T', 'overdueamountmaxdateyear_432T', 'overdueamountmaxdateyear_994T', 'paytype_783L', 'paytype1st_925L', 'payvacationpostpone_4187118D', 'periodicityofpmts_1102L', 'periodicityofpmts_837L', 'personindex_1023L', 'persontype_1072L', 'persontype_792L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'posfstqpd30lastmonth_3976962P', 'postype_4733339M', 'previouscontdistrict_112M', 'processingdate_168D', 'purposeofcred_426M', 'purposeofcred_722M', 'purposeofcred_874M', 'registaddr_district_1083M', 'registaddr_zipcode_184M', 'rejectreason_755M', 'rejectreasonclient_4145042M', 'relatedpersons_role_762T', 'relationshiptoclient_415T', 'relationshiptoclient_642T', 'requesttype_4525192L', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D', 'role_1084L', 'role_993L', 'secondquarter_766L', 'sellerplacecnt_915L', 'sellerplacescnt_216L', 'sex_738L', 'subjectrole_182M', 'subjectrole_326M', 'subjectrole_43M', 'subjectrole_93M', 'thirdquarter_1082L', 'twobodfilling_608L', 'type_25L', 'typesuite_864L', 'validfrom_1069D'] not found in axis"

In [None]:
# df_train, cat_cols = to_pandas(df_train)
# df_test, cat_cols = to_pandas(df_test, cat_cols)

In [None]:

del data_store
gc.collect()

In [None]:
print("Train is duplicated:\t", df_train_filtered["case_id"].duplicated().any())
print("Train Week Range:\t", (df_train_filtered["WEEK_NUM"].min(), df_train["WEEK_NUM"].max()))

print()

print("Test is duplicated:\t", df_test_filtered["case_id"].duplicated().any())
print("Test Week Range:\t", (df_test_filtered["WEEK_NUM"].min(), df_test["WEEK_NUM"].max()))

In [None]:
# sns.lineplot(
#     data=df_train,
#     x="WEEK_NUM",
#     y="target",
# )
# plt.show()

In [None]:
X = df_train_filtered.drop(columns=["target", "case_id", "WEEK_NUM"])
y = df_train_filtered["target"]
weeks = df_train_filtered["WEEK_NUM"]

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 8,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8, 
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "class_weight": "balanced",
    #"device": "gpu",
}

fitted_models = []

for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )

    fitted_models.append(model)

model = VotingModel(fitted_models)

In [None]:
X_test = df_test_filtered.drop(columns=["WEEK_NUM"])
X_test = X_test.set_index("case_id")

y_pred = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

In [None]:
df_subm = pd.read_csv("sample_submission.csv")
df_subm = df_subm.set_index("case_id")

df_subm["score"] = y_pred

In [None]:
print("Check null: ", df_subm["score"].isnull().any())

df_subm_sorted = df_subm.sort_values(by="score", ascending=False)
top_5_scores = df_subm_sorted.head()
print(top_5_scores)

In [None]:
df_subm.to_csv("submission.csv")