# Example Notebook

Welcome to the example notebook for the Home Credit Kaggle competition. The goal of this competition is to determine how likely a customer is going to default on an issued loan. The main difference between the [first](https://www.kaggle.com/c/home-credit-default-risk) and this competition is that now your submission will be scored with a custom metric that will take into account how well the model performs in future. A decline in performance will be penalized. The goal is to create a model that is stable and performs well in the future.

목적: 고객의 대출 불이행 가능성

In this notebook you will see how to:
* Load the data
* Join tables with Polars - a DataFrame library implemented in Rust language, designed to be blazingy fast and memory efficient.  
* Create simple aggregation features
* Train a LightGBM model
* Create a submission table

## Load the data

In [4]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

import polars.selectors as cs
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import missingno as mn

# dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
dataPath = "../data/"

In [5]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

# Load feature_def

In [6]:
def get_feature_definitions(columns):
    return pl.DataFrame({'Variable': columns}).join(
        feature_def,
        on = 'Variable',
        how = 'left',
    )

feature_def = pl.read_csv(dataPath + "feature_definitions.csv")

# Load Basetable

In [7]:
# train
train_basetable = pl.read_csv(dataPath + "csv_files/train/train_base.csv")

# test
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")

In [8]:
print(train_basetable.columns)
print('\n')
print(train_basetable.shape)
print('\n')
display(train_basetable.head())

['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target']


(1526659, 5)




case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


# depth=0

In [9]:
# train
train_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)

# test
test_static = pl.concat(
    [
        pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)

In [10]:
print(train_static.columns)
print('\n')
print(train_static.shape)
print('\n')
display(train_static.head())

['case_id', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_629L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'bankacctype_710L', 'cardtype_51L', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt6m_3712949L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_136L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'credtype_322L', 'cur

case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,payvacationpostpone_4187118D,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,str,str,f64,f64,str,f64,str,f64,str,f64,f64,f64,bool,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
0,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,
1,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0.0""",3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,,18.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,
2,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,,36.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",
3,,,4643.6,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,1.0,False,"""OTHER""","""OTHER""",,,,,,,12.0,0.0,0.0,,"""a55475b1""",,1.0,1.0,,,0.0,0.0,,"""BO""","""AL""",
4,,,3390.2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",


In [11]:
print(train_static_cb.columns)
print('\n')
print(train_static_cb.shape)
print('\n')
display(train_static_cb.head())

['case_id', 'assignmentdate_238D', 'assignmentdate_4527235D', 'assignmentdate_4955616D', 'birthdate_574D', 'contractssum_5085716L', 'dateofbirth_337D', 'dateofbirth_342D', 'days120_123L', 'days180_256L', 'days30_165L', 'days360_512L', 'days90_310L', 'description_5085714M', 'education_1103M', 'education_88M', 'firstquarter_103L', 'for3years_128L', 'for3years_504L', 'for3years_584L', 'formonth_118L', 'formonth_206L', 'formonth_535L', 'forquarter_1017L', 'forquarter_462L', 'forquarter_634L', 'fortoday_1092L', 'forweek_1077L', 'forweek_528L', 'forweek_601L', 'foryear_618L', 'foryear_818L', 'foryear_850L', 'fourthquarter_440L', 'maritalst_385M', 'maritalst_893M', 'numberofqueries_373L', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtaverage_4955615A', 'pmtcount_4527229L', 'pmtcount_4955617L', 'pmtcount_693L', 'pmtscount_423L', 'pmtssum_45A', 'requesttype_4525192L', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D', 'riskassesment_302T', 'riskassesment_940T', 'secondquarter

case_id,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64
357,,,,"""1988-04-01""",,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,6301.4,,"""2019-01-25""",,,,,,
381,,,,"""1973-11-01""",,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,4019.6,,"""2019-01-25""",,,,,,
388,,,,"""1989-04-01""",,"""1989-04-01""",,6.0,8.0,2.0,10.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,,,,,,,,,,,,,,,,,6.0,"""a55475b1""","""a55475b1""",10.0,,,,,,,6.0,14548.0,,"""2019-01-28""",,,,,3.0,5.0
405,,,,"""1974-03-01""",,"""1974-03-01""",,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,4.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,6.0,10498.24,,"""2019-01-21""",,,,,2.0,0.0
409,,,,"""1993-06-01""",,"""1993-06-01""",,2.0,3.0,0.0,3.0,1.0,"""a55475b1""","""717ddd49""","""a55475b1""",4.0,,,,,,,,,,,,,,,,,1.0,"""a7fcb6e5""","""a55475b1""",3.0,,,,,,,7.0,6344.8804,,"""2019-01-21""",,,,,0.0,4.0


# depth=1에 해당하는 Internal file 
- `debitcard_1`
- `deposit_1`

In [12]:
# train
train_debitcard_1 = pl.read_csv(dataPath + "csv_files/train/train_debitcard_1.csv").pipe(set_table_dtypes)
train_deposit_1 = pl.read_csv(dataPath + "csv_files/train/train_deposit_1.csv").pipe(set_table_dtypes)

# test
test_debitcard_1 = pl.read_csv(dataPath + "csv_files/test/test_debitcard_1.csv").pipe(set_table_dtypes)
test_deposit_1 = pl.read_csv(dataPath + "csv_files/test/test_deposit_1.csv").pipe(set_table_dtypes)

In [13]:
print(train_debitcard_1.shape, train_deposit_1.shape)

(157302, 6) (145086, 5)


# Feature engineering

In this part, we can see a simple example of joining tables via `case_id`. Here the loading and joining is done with polars library. Polars library is blazingly fast and has much smaller memory footprint than pandas. 

## Custom
- `last180dayaveragebalance_704A` -> `max_last180dayaveragebalance_704A`
- `amount_416A` -> `sum_deposit_amount_A`

In [14]:
### debitcard ###
# case_id를 기준으로 그룹화
# aggregation functions: 체크카드의 평균 잔액의 최대치
train_debitcard_1_feats = train_debitcard_1.group_by("case_id").agg(
    pl.col("last180dayaveragebalance_704A").max().alias("max_last180dayaveragebalance_704A")
)

### deposit ###
# aggregation functions : 예/적금의 합계 
train_deposit_1_feats = train_deposit_1.group_by('case_id').agg(
    pl.sum('amount_416A').alias('sum_deposit_amount_A')
).sort(by='case_id')

# A, D 유형만 선택
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "D"):
        selected_static_cols.append(col)
print(selected_static_cols)

selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "D"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)


# Join all tables together
data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_debitcard_1_feats, how="left", on="case_id"
).join(
    train_deposit_1_feats, how="left", on="case_id"
)


['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'disbursedcredamount_1113A', 'downpmt_116A', 'dtlastpmtallstes_4499206D', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'inittransactionamount_650A', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'lastapprcredamount_781A', 'lastapprdate_640D', 'lastdelinqdate_224D', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcredamount_222A', 'lastrejectdate_50D', 'lastrepayingdate_696D', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxdpdinstldate_3546855D', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'payvacationpostpone_4187118D', 'price_109

##

In [15]:
### debitcard ###
test_debitcard_1_feats_1 = test_debitcard_1.group_by("case_id").agg(
    pl.col("last180dayaveragebalance_704A").max().alias("max_last180dayaveragebalance_704A")
)

### deposit ###
test_deposit_1_feats = test_deposit_1.group_by('case_id').agg(
    pl.sum('amount_416A').alias('sum_deposit_amount_A')
).sort(by='case_id')

data_submission = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_debitcard_1_feats_1, how="left", on="case_id"
).join(
    test_deposit_1_feats, how="left", on="case_id"
)

In [18]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_valid = train_test_split(case_ids, train_size=0.8, random_state=1)
# case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)

def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
# base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid]: #X_test
    df = convert_strings(df)
    
print('\n')
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
# print(f"Test: {X_test.shape}")

['amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'disbursedcredamount_1113A', 'downpmt_116A', 'dtlastpmtallstes_4499206D', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'inittransactionamount_650A', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'lastapprcredamount_781A', 'lastapprdate_640D', 'lastdelinqdate_224D', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcredamount_222A', 'lastrejectdate_50D', 'lastrepayingdate_696D', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdebt4_972A', 'maxdpdinstldate_3546855D', 'maxinstallast24m_3658928A', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'payvacationpostpone_4187118D', 'price_109

## Optuna

In [19]:
lgb_train = lgb.Dataset(X_train, label=y_train, params={'feature_pre_filter': False})
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train, params={'feature_pre_filter': False})

In [20]:
def gini_stability_metric(base, y_pred):
    base['score'] = y_pred
    gini_scores = base.groupby('WEEK_NUM').apply(lambda x: 2 * roc_auc_score(x['target'], x['score']) - 1).tolist()

    x = np.arange(len(gini_scores))
    y = np.array(gini_scores)
    a, b = np.polyfit(x, y, 1)
    residuals = y - (a * x + b)
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_scores)

    stability_score = avg_gini + 88.0 * min(0, a) - 0.5 * res_std
    return stability_score

def objective(trial):
    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "max_depth": trial.suggest_int("max_depth", 10, 35),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2 ** trial.suggest_int("max_depth_log2", 3, 5)),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.8),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "is_unbalanced": True,
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
    }
    
    gbm = lgb.train(
        param,
        lgb_train,
        num_boost_round=trial.suggest_int("n_estimators", 200, 50000),
        valid_sets=[lgb_valid],
        callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
    )

    preds = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    # AUC 점수는 여기서 계산하지만, 최적화 목적으로 사용하지 않습니다.
    # auc = roc_auc_score(y_valid, preds)
    
    # "gini stability metric" 계산 및 반환
    stability_score = gini_stability_metric(base_valid, preds)
    return stability_score

In [21]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-02-20 23:11:48,905] A new study created in memory with name: no-name-7aeec472-788b-40f2-852c-35474842160e


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.679465


[I 2024-02-20 23:11:53,686] Trial 0 finished with value: 0.3218876492245737 and parameters: {'max_depth': 17, 'max_depth_log2': 4, 'num_leaves': 4, 'min_child_samples': 49, 'colsample_bytree': 0.7556621903172289, 'learning_rate': 0.0034123338832602276, 'reg_alpha': 0.20835017972515013, 'reg_lambda': 0.4170013191986769, 'n_estimators': 1063}. Best is trial 0 with value: 0.3218876492245737.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.718579
Early stopping, best iteration is:
[43]	valid_0's auc: 0.718813


[I 2024-02-20 23:11:57,244] Trial 1 finished with value: 0.40628186767875246 and parameters: {'max_depth': 11, 'max_depth_log2': 5, 'num_leaves': 13, 'min_child_samples': 182, 'colsample_bytree': 0.6306019053406406, 'learning_rate': 0.0017852186238674801, 'reg_alpha': 0.44978393500910463, 'reg_lambda': 0.0826808612736546, 'n_estimators': 28895}. Best is trial 1 with value: 0.40628186767875246.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.763704
[100]	valid_0's auc: 0.770198
[150]	valid_0's auc: 0.773683
[200]	valid_0's auc: 0.775048
[250]	valid_0's auc: 0.7763
[300]	valid_0's auc: 0.777097
Early stopping, best iteration is:
[339]	valid_0's auc: 0.777657


[I 2024-02-20 23:12:09,541] Trial 2 finished with value: 0.5179061853237601 and parameters: {'max_depth': 24, 'max_depth_log2': 4, 'num_leaves': 4, 'min_child_samples': 138, 'colsample_bytree': 0.710669693931135, 'learning_rate': 0.27841212252546366, 'reg_alpha': 0.47426451007384407, 'reg_lambda': 0.49284008243919886, 'n_estimators': 19325}. Best is trial 2 with value: 0.5179061853237601.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.710253
Early stopping, best iteration is:
[78]	valid_0's auc: 0.712321


[I 2024-02-20 23:12:14,454] Trial 3 finished with value: 0.3935952900710157 and parameters: {'max_depth': 32, 'max_depth_log2': 4, 'num_leaves': 9, 'min_child_samples': 136, 'colsample_bytree': 0.657255977771701, 'learning_rate': 0.0022612602979514827, 'reg_alpha': 0.8726576467195942, 'reg_lambda': 0.0971489500642867, 'n_estimators': 13423}. Best is trial 2 with value: 0.5179061853237601.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.771953
[100]	valid_0's auc: 0.778652
[150]	valid_0's auc: 0.780317
Early stopping, best iteration is:
[184]	valid_0's auc: 0.781013


[I 2024-02-20 23:12:26,559] Trial 4 finished with value: 0.52106876831051 and parameters: {'max_depth': 23, 'max_depth_log2': 5, 'num_leaves': 24, 'min_child_samples': 96, 'colsample_bytree': 0.6881088367229128, 'learning_rate': 0.0904461498174894, 'reg_alpha': 0.00552025874264539, 'reg_lambda': 0.2935689822971852, 'n_estimators': 49358}. Best is trial 4 with value: 0.52106876831051.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.744575
[100]	valid_0's auc: 0.757868
[150]	valid_0's auc: 0.765046
[200]	valid_0's auc: 0.769465
[250]	valid_0's auc: 0.772382
[300]	valid_0's auc: 0.774661
[350]	valid_0's auc: 0.776285
[400]	valid_0's auc: 0.777508
[450]	valid_0's auc: 0.778479
[500]	valid_0's auc: 0.779247
[550]	valid_0's auc: 0.779854
[600]	valid_0's auc: 0.780319
[650]	valid_0's auc: 0.780526
[700]	valid_0's auc: 0.780751
[750]	valid_0's auc: 0.780984
[800]	valid_0's auc: 0.781113
[850]	valid_0's auc: 0.781313
Early stopping, best iteration is:
[851]	valid_0's auc: 0.781318


[I 2024-02-20 23:13:14,165] Trial 5 finished with value: 0.5220398199308052 and parameters: {'max_depth': 26, 'max_depth_log2': 5, 'num_leaves': 18, 'min_child_samples': 100, 'colsample_bytree': 0.6754586709928967, 'learning_rate': 0.02143899830399748, 'reg_alpha': 0.39706358568989353, 'reg_lambda': 0.3927702275840378, 'n_estimators': 27212}. Best is trial 5 with value: 0.5220398199308052.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.72148
[100]	valid_0's auc: 0.728699
[150]	valid_0's auc: 0.734012
[200]	valid_0's auc: 0.739503
[250]	valid_0's auc: 0.744505
[300]	valid_0's auc: 0.749048
[350]	valid_0's auc: 0.752577
[400]	valid_0's auc: 0.755655
[450]	valid_0's auc: 0.758248
[500]	valid_0's auc: 0.760291
[550]	valid_0's auc: 0.761898
[600]	valid_0's auc: 0.763418
[650]	valid_0's auc: 0.764659
[700]	valid_0's auc: 0.765829
[750]	valid_0's auc: 0.766928
[800]	valid_0's auc: 0.767842
[850]	valid_0's auc: 0.768688
[900]	valid_0's auc: 0.769468
[950]	valid_0's auc: 0.770147
[1000]	valid_0's auc: 0.7708
[1050]	valid_0's auc: 0.77139
[1100]	valid_0's auc: 0.771961
[1150]	valid_0's auc: 0.772475
[1200]	valid_0's auc: 0.772942
[1250]	valid_0's auc: 0.773389
[1300]	valid_0's auc: 0.773823
[1350]	valid_0's auc: 0.774231
[1400]	valid_0's auc: 0.774604
[1450]	valid_0's auc: 0.774954
[1500]	valid_0's auc: 0.775298
[1550]	valid_0's auc: 0.7756
[160

[I 2024-02-20 23:16:27,379] Trial 6 finished with value: 0.52026613780338 and parameters: {'max_depth': 27, 'max_depth_log2': 4, 'num_leaves': 12, 'min_child_samples': 61, 'colsample_bytree': 0.6298488358421825, 'learning_rate': 0.006253960482115013, 'reg_alpha': 0.9864492058121037, 'reg_lambda': 0.6853263163688121, 'n_estimators': 20514}. Best is trial 5 with value: 0.5220398199308052.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.761432
[100]	valid_0's auc: 0.769891
[150]	valid_0's auc: 0.774057
[200]	valid_0's auc: 0.776023
[250]	valid_0's auc: 0.777152
[300]	valid_0's auc: 0.778598
[350]	valid_0's auc: 0.779157
[400]	valid_0's auc: 0.779568
[450]	valid_0's auc: 0.780194
Early stopping, best iteration is:
[470]	valid_0's auc: 0.780312


[I 2024-02-20 23:16:48,153] Trial 7 finished with value: 0.5214766809290632 and parameters: {'max_depth': 25, 'max_depth_log2': 5, 'num_leaves': 6, 'min_child_samples': 105, 'colsample_bytree': 0.7254911753285214, 'learning_rate': 0.12256164256819461, 'reg_alpha': 0.7194000749698091, 'reg_lambda': 0.24876375877075407, 'n_estimators': 15731}. Best is trial 5 with value: 0.5220398199308052.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[31]	valid_0's auc: 0.651716


[I 2024-02-20 23:16:50,021] Trial 8 finished with value: 0.2530600095562022 and parameters: {'max_depth': 24, 'max_depth_log2': 3, 'num_leaves': 2, 'min_child_samples': 59, 'colsample_bytree': 0.6873222504802832, 'learning_rate': 0.004713630299805104, 'reg_alpha': 0.12419490067426042, 'reg_lambda': 0.3220784499592948, 'n_estimators': 25570}. Best is trial 5 with value: 0.5220398199308052.


Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[33]	valid_0's auc: 0.683422


[I 2024-02-20 23:16:52,622] Trial 9 finished with value: 0.32950280273636245 and parameters: {'max_depth': 31, 'max_depth_log2': 3, 'num_leaves': 4, 'min_child_samples': 150, 'colsample_bytree': 0.7182519913817431, 'learning_rate': 0.0034370614408440713, 'reg_alpha': 0.617279133402338, 'reg_lambda': 0.16392252346346414, 'n_estimators': 16175}. Best is trial 5 with value: 0.5220398199308052.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.753346
[100]	valid_0's auc: 0.763153
[150]	valid_0's auc: 0.768783
[200]	valid_0's auc: 0.772508
[250]	valid_0's auc: 0.775048
[300]	valid_0's auc: 0.777208
[350]	valid_0's auc: 0.778815
[400]	valid_0's auc: 0.780058
[450]	valid_0's auc: 0.780957
[500]	valid_0's auc: 0.78158
[550]	valid_0's auc: 0.782004
[600]	valid_0's auc: 0.782295
[650]	valid_0's auc: 0.782543
Early stopping, best iteration is:
[680]	valid_0's auc: 0.782616


[I 2024-02-20 23:17:48,040] Trial 10 finished with value: 0.5242241099128041 and parameters: {'max_depth': 19, 'max_depth_log2': 5, 'num_leaves': 31, 'min_child_samples': 23, 'colsample_bytree': 0.5298574420814703, 'learning_rate': 0.019478058285916436, 'reg_alpha': 0.3106879951233207, 'reg_lambda': 0.9932735532466175, 'n_estimators': 36447}. Best is trial 10 with value: 0.5242241099128041.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.753421
[100]	valid_0's auc: 0.762818
[150]	valid_0's auc: 0.768521
[200]	valid_0's auc: 0.772171
[250]	valid_0's auc: 0.774816
[300]	valid_0's auc: 0.776931
[350]	valid_0's auc: 0.778478
[400]	valid_0's auc: 0.779811
[450]	valid_0's auc: 0.780672
[500]	valid_0's auc: 0.781319
[550]	valid_0's auc: 0.781782
[600]	valid_0's auc: 0.782049
[650]	valid_0's auc: 0.782272
[700]	valid_0's auc: 0.782435
[750]	valid_0's auc: 0.782567
[800]	valid_0's auc: 0.782672
Early stopping, best iteration is:
[825]	valid_0's auc: 0.782742


[I 2024-02-20 23:18:53,437] Trial 11 finished with value: 0.5241503145290927 and parameters: {'max_depth': 17, 'max_depth_log2': 5, 'num_leaves': 32, 'min_child_samples': 11, 'colsample_bytree': 0.5238337382968046, 'learning_rate': 0.01890490092465844, 'reg_alpha': 0.3131518159473751, 'reg_lambda': 0.9112419433229066, 'n_estimators': 38150}. Best is trial 10 with value: 0.5242241099128041.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.75499
[100]	valid_0's auc: 0.764254
[150]	valid_0's auc: 0.769777
[200]	valid_0's auc: 0.773323
[250]	valid_0's auc: 0.776015
[300]	valid_0's auc: 0.778069
[350]	valid_0's auc: 0.77967
[400]	valid_0's auc: 0.780642
[450]	valid_0's auc: 0.781443
[500]	valid_0's auc: 0.781991
[550]	valid_0's auc: 0.782396
[600]	valid_0's auc: 0.782629
[650]	valid_0's auc: 0.782844
[700]	valid_0's auc: 0.782937
[750]	valid_0's auc: 0.783019
Early stopping, best iteration is:
[779]	valid_0's auc: 0.783102


[I 2024-02-20 23:19:54,471] Trial 12 finished with value: 0.5251319357300974 and parameters: {'max_depth': 17, 'max_depth_log2': 5, 'num_leaves': 32, 'min_child_samples': 19, 'colsample_bytree': 0.5120881396228946, 'learning_rate': 0.02052442680800637, 'reg_alpha': 0.2822492861528089, 'reg_lambda': 0.9909184682098109, 'n_estimators': 40163}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.765372
[100]	valid_0's auc: 0.774372
[150]	valid_0's auc: 0.778498
[200]	valid_0's auc: 0.780618
[250]	valid_0's auc: 0.781704
[300]	valid_0's auc: 0.782068
[350]	valid_0's auc: 0.782395
[400]	valid_0's auc: 0.782617
Early stopping, best iteration is:
[437]	valid_0's auc: 0.782721


[I 2024-02-20 23:20:28,110] Trial 13 finished with value: 0.5234803842454631 and parameters: {'max_depth': 17, 'max_depth_log2': 5, 'num_leaves': 32, 'min_child_samples': 12, 'colsample_bytree': 0.5182587305845543, 'learning_rate': 0.046147284791761535, 'reg_alpha': 0.29920371771621873, 'reg_lambda': 0.9800893403414597, 'n_estimators': 39879}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.741639
[100]	valid_0's auc: 0.750162
[150]	valid_0's auc: 0.755569
[200]	valid_0's auc: 0.760633
[250]	valid_0's auc: 0.764249
[300]	valid_0's auc: 0.767218
[350]	valid_0's auc: 0.769423
[400]	valid_0's auc: 0.771243
[450]	valid_0's auc: 0.772829
[500]	valid_0's auc: 0.774128
[550]	valid_0's auc: 0.775217
[600]	valid_0's auc: 0.776167
[650]	valid_0's auc: 0.777043
[700]	valid_0's auc: 0.777855
[750]	valid_0's auc: 0.778465
[800]	valid_0's auc: 0.778999
[850]	valid_0's auc: 0.779469
[900]	valid_0's auc: 0.77988
[950]	valid_0's auc: 0.780255
[1000]	valid_0's auc: 0.780586
[1050]	valid_0's auc: 0.780847
[1100]	valid_0's auc: 0.781104
[1150]	valid_0's auc: 0.78128
[1200]	valid_0's auc: 0.781443
[1250]	valid_0's auc: 0.7816
[1300]	valid_0's auc: 0.781716
[1350]	valid_0's auc: 0.7818
[1400]	valid_0's auc: 0.781903
[1450]	valid_0's auc: 0.781971
[1500]	valid_0's auc: 0.782026
[1550]	valid_0's auc: 0.782088
[160

[I 2024-02-20 23:22:34,240] Trial 14 finished with value: 0.5237127430090293 and parameters: {'max_depth': 12, 'max_depth_log2': 5, 'num_leaves': 26, 'min_child_samples': 33, 'colsample_bytree': 0.5736920222114028, 'learning_rate': 0.009908505405990658, 'reg_alpha': 0.6039918691802593, 'reg_lambda': 0.7949931862363877, 'n_estimators': 36132}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.754241
[100]	valid_0's auc: 0.766891
[150]	valid_0's auc: 0.772214
[200]	valid_0's auc: 0.775443
[250]	valid_0's auc: 0.777429
[300]	valid_0's auc: 0.778497
[350]	valid_0's auc: 0.779319
[400]	valid_0's auc: 0.779823
[450]	valid_0's auc: 0.780213
[500]	valid_0's auc: 0.780509
[550]	valid_0's auc: 0.780697
Early stopping, best iteration is:
[555]	valid_0's auc: 0.780707


[I 2024-02-20 23:23:05,017] Trial 15 finished with value: 0.5202381729614092 and parameters: {'max_depth': 18, 'max_depth_log2': 4, 'num_leaves': 16, 'min_child_samples': 71, 'colsample_bytree': 0.5745451145762523, 'learning_rate': 0.039066673180465236, 'reg_alpha': 0.10359645787469102, 'reg_lambda': 0.6874373837880042, 'n_estimators': 46298}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.744972
[100]	valid_0's auc: 0.752864
[150]	valid_0's auc: 0.758587
[200]	valid_0's auc: 0.763063
[250]	valid_0's auc: 0.766321
[300]	valid_0's auc: 0.768823
[350]	valid_0's auc: 0.770954
[400]	valid_0's auc: 0.772669
[450]	valid_0's auc: 0.774119
[500]	valid_0's auc: 0.775346
[550]	valid_0's auc: 0.776403
[600]	valid_0's auc: 0.777318
[650]	valid_0's auc: 0.778163
[700]	valid_0's auc: 0.778926
[750]	valid_0's auc: 0.779526
[800]	valid_0's auc: 0.780048
[850]	valid_0's auc: 0.78042
[900]	valid_0's auc: 0.78075
[950]	valid_0's auc: 0.781055
[1000]	valid_0's auc: 0.781268
[1050]	valid_0's auc: 0.781441
[1100]	valid_0's auc: 0.781601
[1150]	valid_0's auc: 0.781775
[1200]	valid_0's auc: 0.781926
[1250]	valid_0's auc: 0.782002
[1300]	valid_0's auc: 0.782082
[1350]	valid_0's auc: 0.782149
[1400]	valid_0's auc: 0.782224
[1450]	valid_0's auc: 0.782315
[1500]	valid_0's auc: 0.782373
Early stopping, best iteration 

[I 2024-02-20 23:24:57,906] Trial 16 finished with value: 0.5234088877860734 and parameters: {'max_depth': 20, 'max_depth_log2': 5, 'num_leaves': 27, 'min_child_samples': 37, 'colsample_bytree': 0.5007037318821986, 'learning_rate': 0.010766104055430326, 'reg_alpha': 0.2751180055597221, 'reg_lambda': 0.8316642528063005, 'n_estimators': 32333}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.741708
[100]	valid_0's auc: 0.757409
[150]	valid_0's auc: 0.764653
[200]	valid_0's auc: 0.768653
[250]	valid_0's auc: 0.77118
[300]	valid_0's auc: 0.773123
[350]	valid_0's auc: 0.774595
[400]	valid_0's auc: 0.775654
[450]	valid_0's auc: 0.776483
[500]	valid_0's auc: 0.77709
[550]	valid_0's auc: 0.777575
[600]	valid_0's auc: 0.777967
[650]	valid_0's auc: 0.77837
[700]	valid_0's auc: 0.778718
[750]	valid_0's auc: 0.779058
[800]	valid_0's auc: 0.779468
[850]	valid_0's auc: 0.779745
[900]	valid_0's auc: 0.780007
[950]	valid_0's auc: 0.780175
[1000]	valid_0's auc: 0.780413
[1050]	valid_0's auc: 0.78059
[1100]	valid_0's auc: 0.780738
[1150]	valid_0's auc: 0.78087
Early stopping, best iteration is:
[1169]	valid_0's auc: 0.780926


[I 2024-02-20 23:25:50,734] Trial 17 finished with value: 0.52080486227584 and parameters: {'max_depth': 15, 'max_depth_log2': 3, 'num_leaves': 8, 'min_child_samples': 78, 'colsample_bytree': 0.5689362838695382, 'learning_rate': 0.03683306195441281, 'reg_alpha': 0.5656576883838806, 'reg_lambda': 0.652978718452601, 'n_estimators': 43422}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.740236
[100]	valid_0's auc: 0.748771
[150]	valid_0's auc: 0.755605
[200]	valid_0's auc: 0.760656
[250]	valid_0's auc: 0.764412
[300]	valid_0's auc: 0.767265
[350]	valid_0's auc: 0.769486
[400]	valid_0's auc: 0.771121
[450]	valid_0's auc: 0.772657
[500]	valid_0's auc: 0.773951
[550]	valid_0's auc: 0.775051
[600]	valid_0's auc: 0.776021
[650]	valid_0's auc: 0.77685
[700]	valid_0's auc: 0.777565
[750]	valid_0's auc: 0.778206
[800]	valid_0's auc: 0.778755
[850]	valid_0's auc: 0.779247
[900]	valid_0's auc: 0.779662
[950]	valid_0's auc: 0.780057
[1000]	valid_0's auc: 0.780315
[1050]	valid_0's auc: 0.780605
[1100]	valid_0's auc: 0.780818
[1150]	valid_0's auc: 0.780947
[1200]	valid_0's auc: 0.781092
[1250]	valid_0's auc: 0.781214
[1300]	valid_0's auc: 0.781319
[1350]	valid_0's auc: 0.781438
[1400]	valid_0's auc: 0.781529
[1450]	valid_0's auc: 0.781639
[1500]	valid_0's auc: 0.781711
Early stopping, best iteration

[I 2024-02-20 23:27:43,469] Trial 18 finished with value: 0.5221676939533801 and parameters: {'max_depth': 20, 'max_depth_log2': 5, 'num_leaves': 21, 'min_child_samples': 30, 'colsample_bytree': 0.543223738344428, 'learning_rate': 0.01110237593891279, 'reg_alpha': 0.36400315698947583, 'reg_lambda': 0.989783223711838, 'n_estimators': 33558}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.761997
[100]	valid_0's auc: 0.771701
[150]	valid_0's auc: 0.775576
[200]	valid_0's auc: 0.777493
[250]	valid_0's auc: 0.778572
[300]	valid_0's auc: 0.779208
[350]	valid_0's auc: 0.77973
Early stopping, best iteration is:
[359]	valid_0's auc: 0.779933


[I 2024-02-20 23:28:02,015] Trial 19 finished with value: 0.5192030248312892 and parameters: {'max_depth': 14, 'max_depth_log2': 4, 'num_leaves': 11, 'min_child_samples': 23, 'colsample_bytree': 0.6018217120642098, 'learning_rate': 0.07316678872741428, 'reg_alpha': 0.17448913606932256, 'reg_lambda': 0.826409612211687, 'n_estimators': 41544}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.754876
[100]	valid_0's auc: 0.765305
[150]	valid_0's auc: 0.77094
[200]	valid_0's auc: 0.774565
[250]	valid_0's auc: 0.777057
[300]	valid_0's auc: 0.778734
[350]	valid_0's auc: 0.779891
[400]	valid_0's auc: 0.780694
[450]	valid_0's auc: 0.781245
[500]	valid_0's auc: 0.78155
[550]	valid_0's auc: 0.781787
[600]	valid_0's auc: 0.782046
[650]	valid_0's auc: 0.782181
[700]	valid_0's auc: 0.782378
Early stopping, best iteration is:
[698]	valid_0's auc: 0.782381


[I 2024-02-20 23:28:54,952] Trial 20 finished with value: 0.5236972877293774 and parameters: {'max_depth': 29, 'max_depth_log2': 5, 'num_leaves': 29, 'min_child_samples': 48, 'colsample_bytree': 0.5484198649554466, 'learning_rate': 0.024690224545429656, 'reg_alpha': 0.014200669762658436, 'reg_lambda': 0.5810437084884772, 'n_estimators': 7861}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.752263
[100]	valid_0's auc: 0.761329
[150]	valid_0's auc: 0.767091
[200]	valid_0's auc: 0.771114
[250]	valid_0's auc: 0.773896
[300]	valid_0's auc: 0.775969
[350]	valid_0's auc: 0.777734
[400]	valid_0's auc: 0.778907
[450]	valid_0's auc: 0.779892
[500]	valid_0's auc: 0.780798
[550]	valid_0's auc: 0.781368
[600]	valid_0's auc: 0.781714
[650]	valid_0's auc: 0.782009
[700]	valid_0's auc: 0.782197
[750]	valid_0's auc: 0.782369
[800]	valid_0's auc: 0.782495
[850]	valid_0's auc: 0.782598
[900]	valid_0's auc: 0.782716
[950]	valid_0's auc: 0.78281
[1000]	valid_0's auc: 0.782915
Early stopping, best iteration is:
[1008]	valid_0's auc: 0.782923


[I 2024-02-20 23:30:11,998] Trial 21 finished with value: 0.5245461588498567 and parameters: {'max_depth': 21, 'max_depth_log2': 5, 'num_leaves': 32, 'min_child_samples': 12, 'colsample_bytree': 0.5234903842154367, 'learning_rate': 0.017368745401903907, 'reg_alpha': 0.29943069227189306, 'reg_lambda': 0.8812619402797573, 'n_estimators': 38378}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.748336
[100]	valid_0's auc: 0.756482
[150]	valid_0's auc: 0.762486
[200]	valid_0's auc: 0.766721
[250]	valid_0's auc: 0.769834
[300]	valid_0's auc: 0.772282
[350]	valid_0's auc: 0.774208
[400]	valid_0's auc: 0.775688
[450]	valid_0's auc: 0.777012
[500]	valid_0's auc: 0.778145
[550]	valid_0's auc: 0.77906
[600]	valid_0's auc: 0.779715
[650]	valid_0's auc: 0.780244
[700]	valid_0's auc: 0.780768
[750]	valid_0's auc: 0.781156
[800]	valid_0's auc: 0.781498
[850]	valid_0's auc: 0.781698
[900]	valid_0's auc: 0.781886
[950]	valid_0's auc: 0.78197
[1000]	valid_0's auc: 0.78211
[1050]	valid_0's auc: 0.782221
[1100]	valid_0's auc: 0.78231
[1150]	valid_0's auc: 0.782418
[1200]	valid_0's auc: 0.782483
[1250]	valid_0's auc: 0.782573
[1300]	valid_0's auc: 0.782682
Early stopping, best iteration is:
[1302]	valid_0's auc: 0.782687


[I 2024-02-20 23:31:49,618] Trial 22 finished with value: 0.5250688106630533 and parameters: {'max_depth': 20, 'max_depth_log2': 5, 'num_leaves': 29, 'min_child_samples': 10, 'colsample_bytree': 0.5009838649180754, 'learning_rate': 0.013425534297903618, 'reg_alpha': 0.22930219644485073, 'reg_lambda': 0.9039698885984148, 'n_estimators': 33820}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.743921
[100]	valid_0's auc: 0.750702
[150]	valid_0's auc: 0.755499
[200]	valid_0's auc: 0.759822
[250]	valid_0's auc: 0.763057
[300]	valid_0's auc: 0.76559
[350]	valid_0's auc: 0.767837
[400]	valid_0's auc: 0.769601
[450]	valid_0's auc: 0.771176
[500]	valid_0's auc: 0.77255
[550]	valid_0's auc: 0.773746
[600]	valid_0's auc: 0.77471
[650]	valid_0's auc: 0.77564
[700]	valid_0's auc: 0.776539
[750]	valid_0's auc: 0.777238
[800]	valid_0's auc: 0.777881
[850]	valid_0's auc: 0.778524
[900]	valid_0's auc: 0.779111
[950]	valid_0's auc: 0.779584
[1000]	valid_0's auc: 0.779975
[1050]	valid_0's auc: 0.780296
[1100]	valid_0's auc: 0.780605
[1150]	valid_0's auc: 0.780924
[1200]	valid_0's auc: 0.781172
[1250]	valid_0's auc: 0.781428
[1300]	valid_0's auc: 0.781613
[1350]	valid_0's auc: 0.781752
[1400]	valid_0's auc: 0.781866
[1450]	valid_0's auc: 0.782011
[1500]	valid_0's auc: 0.782126
[1550]	valid_0's auc: 0.782202
[1

[I 2024-02-20 23:34:17,737] Trial 23 finished with value: 0.5244255948539286 and parameters: {'max_depth': 35, 'max_depth_log2': 5, 'num_leaves': 29, 'min_child_samples': 12, 'colsample_bytree': 0.504933771923954, 'learning_rate': 0.008079555623085828, 'reg_alpha': 0.21854066924723847, 'reg_lambda': 0.8968501492139834, 'n_estimators': 31115}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.7117
Early stopping, best iteration is:
[49]	valid_0's auc: 0.711911


[I 2024-02-20 23:34:22,029] Trial 24 finished with value: 0.38943539575588243 and parameters: {'max_depth': 21, 'max_depth_log2': 4, 'num_leaves': 15, 'min_child_samples': 42, 'colsample_bytree': 0.7964631384341481, 'learning_rate': 0.0010379286989881816, 'reg_alpha': 0.400629455153668, 'reg_lambda': 0.7611630413012453, 'n_estimators': 45680}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.742608
[100]	valid_0's auc: 0.751319
[150]	valid_0's auc: 0.758078
[200]	valid_0's auc: 0.762919
[250]	valid_0's auc: 0.766575
[300]	valid_0's auc: 0.769124
[350]	valid_0's auc: 0.771175
[400]	valid_0's auc: 0.772891
[450]	valid_0's auc: 0.774398
[500]	valid_0's auc: 0.775504
[550]	valid_0's auc: 0.776556
[600]	valid_0's auc: 0.777422
[650]	valid_0's auc: 0.778206
[700]	valid_0's auc: 0.778842
[750]	valid_0's auc: 0.779446
[800]	valid_0's auc: 0.779863
[850]	valid_0's auc: 0.780233
[900]	valid_0's auc: 0.78056
[950]	valid_0's auc: 0.780792
[1000]	valid_0's auc: 0.781007
[1050]	valid_0's auc: 0.781185
[1100]	valid_0's auc: 0.781313
[1150]	valid_0's auc: 0.781445
[1200]	valid_0's auc: 0.781585
[1250]	valid_0's auc: 0.781694
[1300]	valid_0's auc: 0.78181
[1350]	valid_0's auc: 0.781893
[1400]	valid_0's auc: 0.781994
[1450]	valid_0's auc: 0.782064
[1500]	valid_0's auc: 0.782142
[1550]	valid_0's auc: 0.782211


[I 2024-02-20 23:36:00,830] Trial 25 finished with value: 0.5238162950308483 and parameters: {'max_depth': 22, 'max_depth_log2': 5, 'num_leaves': 23, 'min_child_samples': 85, 'colsample_bytree': 0.5562204027267873, 'learning_rate': 0.012069919619697179, 'reg_alpha': 0.12628632249525845, 'reg_lambda': 0.9021157517712289, 'n_estimators': 34049}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.757818
[100]	valid_0's auc: 0.768357
[150]	valid_0's auc: 0.773549
[200]	valid_0's auc: 0.77684
[250]	valid_0's auc: 0.778837
[300]	valid_0's auc: 0.780306
[350]	valid_0's auc: 0.781032
[400]	valid_0's auc: 0.781529
[450]	valid_0's auc: 0.781859
[500]	valid_0's auc: 0.782111
[550]	valid_0's auc: 0.782269
[600]	valid_0's auc: 0.782498
Early stopping, best iteration is:
[600]	valid_0's auc: 0.782498


[I 2024-02-20 23:36:38,799] Trial 26 finished with value: 0.5234276802777613 and parameters: {'max_depth': 14, 'max_depth_log2': 5, 'num_leaves': 29, 'min_child_samples': 61, 'colsample_bytree': 0.596074366818065, 'learning_rate': 0.030208205695251053, 'reg_alpha': 0.2408916735504093, 'reg_lambda': 0.7559359442132918, 'n_estimators': 40168}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.762175
[100]	valid_0's auc: 0.772487
[150]	valid_0's auc: 0.776623
[200]	valid_0's auc: 0.778667
[250]	valid_0's auc: 0.779771
[300]	valid_0's auc: 0.780346
[350]	valid_0's auc: 0.780841
[400]	valid_0's auc: 0.781385
[450]	valid_0's auc: 0.781665
Early stopping, best iteration is:
[480]	valid_0's auc: 0.781868


[I 2024-02-20 23:37:03,105] Trial 27 finished with value: 0.5223739766321114 and parameters: {'max_depth': 15, 'max_depth_log2': 4, 'num_leaves': 15, 'min_child_samples': 122, 'colsample_bytree': 0.598091047233347, 'learning_rate': 0.06222901225962927, 'reg_alpha': 0.517282572023763, 'reg_lambda': 0.8986395494071254, 'n_estimators': 23224}. Best is trial 12 with value: 0.5251319357300974.


Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.748273
[100]	valid_0's auc: 0.758279
[150]	valid_0's auc: 0.764605
[200]	valid_0's auc: 0.768795
[250]	valid_0's auc: 0.771763


[W 2024-02-20 23:37:26,063] Trial 28 failed with parameters: {'max_depth': 22, 'max_depth_log2': 5, 'num_leaves': 26, 'min_child_samples': 177, 'colsample_bytree': 0.5336639218897633, 'learning_rate': 0.016785309033485803, 'reg_alpha': 0.0609346143580064, 'reg_lambda': 0.6105994059731199, 'n_estimators': 29360} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\hmk40\anaconda3\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\hmk40\AppData\Local\Temp\ipykernel_19204\2238549198.py", line 31, in objective
    gbm = lgb.train(
          ^^^^^^^^^^
  File "c:\Users\hmk40\anaconda3\Lib\site-packages\lightgbm\engine.py", line 276, in train
    booster.update(fobj=fobj)
  File "c:\Users\hmk40\anaconda3\Lib\site-packages\lightgbm\basic.py", line 3891, in update
    _safe_call(_LIB.LGBM_BoosterUpdateOneIter(
               ^^^^^^^^^^^^^^^

KeyboardInterrupt: 

## Training LightGBM

Minimal example of LightGBM training is shown below.

In [None]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)

Evaluation with AUC and then comparison with the stability metric is shown below.

In [3]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')  


def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print('\n')
print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')  

NameError: name 'base_train' is not defined

## Optuna

## Submission

Scoring the submission dataset is below, we need to take care of new categories. Then we save the score as a last step. 

In [24]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    train_categories = set(X_train[col].cat.categories)
    submission_categories = set(X_submission[col].cat.categories)
    new_categories = submission_categories - train_categories
    X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    X_train[col] = X_train[col].astype(new_dtype)
    X_submission[col] = X_submission[col].astype(new_dtype)

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)


submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")

In [14]:
def objective(trial):
    param = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = lgb.train(
        param, 
        lgb_train, 
        valid_sets=[lgb_valid], 
        verbose_eval=False, 
        early_stopping_rounds=10
    )

    preds = gbm.predict(X_valid, num_iteration=gbm.best_iteration)
    auc = roc_auc_score(y_valid, preds)
    return auc

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))