In [1]:
local = True
data_transform = False
# transform_method = "standardscaler"  
transform_method = "rankgauss"

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer


# About this notebook
- Deberta-v3-large starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/pppm-pip-wheels)
- Training notebook is [here](https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train)

If this notebook is helpful, feel free to upvote :)

In [3]:
# ====================================================
# Directory settings
# ====================================================
exp_names = [
            'albert-base-v2',
             "roberta-base",
            "microsoft-mpnet-base",
            "funnel-transformer-large512",
             "deberta-v3-base",
             "microsoft-deberta-large",
             
            ]

# stacking_exp_name = f"4model_stacking_1dcnn_{transform_method}"
stacking_exp_name = f"6model_stacking_lgbm_xgb"
import os
if local:
    INPUT_DIR = '../../data/us-patent-phrase-to-phrase-matching/'
    
    OUTPUT_DIR = f"./output/{stacking_exp_name}/"
else:
    INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
    OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [4]:
def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

# CFG

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    
    #######実行ごとに変更必要の可能性あり
    EXP_NAMES = exp_names
    models=[
            "roberta-base",
            "microsoft/mpnet-base",
            "funnel-transformer-large512",
            "microsoft/deberta-v3-base",
            "microsoft/deberta-v3-large",
            'albert-base-v2',
            
           ]
    folder_names = [
                    "roberta-base",
                    "mpnetbase",
                    "funnellarge512fold4",
                   "debertabase",
                   "debertalarge",
                    
                    'albert-base-v2',
                  ]
    
    use_hidden_states = [
        False,False,
        False, True, False, False
    ]
    
    max_lens = [
        175,125,
        125, 133, 133,127,]
    pass_folds_flg = False
    pass_folds  = [
        -1,
        -1, -1, -1
    ]
    
    #######
    paths = []
    config_paths = []
    model_paths = []
    for EXP_NAME in EXP_NAMES:
        if local:
            path=f"../exp4/output/{EXP_NAME}/"
            config_path=path+'config.pth'
            model_path=f'../exp4/output/{EXP_NAME}/'
            cpc_path = f"{INPUT_DIR}/cpc_texts.pth"
        else:
            path=f"../input/{folder_name}/{EXP_NAME}/"
            config_path=path+'config.pth'
            model_path=f'../input/{folder_name}/'
            cpc_path = "../input/pppm-deberta-v3-large-baseline-w-w-b-train/cpc_texts.pth"
        paths.append(path)
        config_paths.append(config_path)
        model_paths.append(model_path)
        
    num_workers=4
    hidden_states = []
    for model in models:
        if "small" in model:
            hidden_state  = 512
        elif "base" in model:
            hidden_state  = 768
        elif "large" in model:
            hidden_state  = 1024
        elif "xlarge" in model:
            hidden_state =  1536
        hidden_states.append(hidden_state)
    
    batch_size=32
    fc_dropout=0.2
    target_size=1
    max_len=133
    seed=42
    n_fold=4
    trn_fold=[i for i in range(n_fold)]
    pass_fold = []
#     torch.load(CFG.model_path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
    

# Library

In [6]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

if not local:
    os.system('pip uninstall -y transformers')
    os.system('pip uninstall -y tokenizers')
    os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset transformers')
    os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device("cpu")

print(device)

torch.__version__: 1.9.0+cu111
tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2
env: TOKENIZERS_PARALLELISM=true
cpu


In [7]:
import numpy as np
import random
import pandas as pd
from copy import deepcopy as dp

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _WeightedLoss


# Utils

In [8]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    score = sp.stats.pearsonr(y_true, y_pred)[0]
    return score




def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# OOF

In [9]:
# oof_df = pd.read_pickle(CFG.path+'oof_df.pkl')
# labels = oof_df['score'].values
# preds = oof_df['pred'].values
# score = get_score(labels, preds)
# LOGGER.info(f'CV Score: {score:<.4f}')
train = pd.DataFrame()
for path in CFG.paths:
    oof_file = [f for f in os.listdir(path) if "oof_df.pkl" in f][0]
    oof_df = pd.read_pickle(path+oof_file)
    labels = oof_df['score'].values
    preds = oof_df[["id",'pred']]
    score = get_score(labels, preds["pred"])
    LOGGER.info(f'CV Score: {score:<.4f}')
    train = pd.concat([train, oof_df[["pred"]]],axis=1)
train.columns = [c+str(i) for i, c in enumerate(train.columns)]
train["id"] = preds["id"]

CV Score: 0.7817
CV Score: 0.7990
CV Score: 0.8159
CV Score: 0.8485
CV Score: 0.8422
CV Score: 0.8574


In [10]:
_train = pd.read_csv(f"{INPUT_DIR}train.csv")

In [11]:
# train = 
cols = train.columns.tolist()
train = train.merge(_train, on="id")[cols + ["score"]+["anchor"]]
targets = train["score"]

# lightgbm  

In [12]:
import lightgbm as lgb

In [13]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

dfx = pd.get_dummies(train, columns=["score"]).groupby(["anchor"], as_index=False).sum()
cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "anchor"]
dfx_labels = dfx[labels]
dfx["fold"] = -1

for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "fold"] = fold

train = train.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
print(train.fold.value_counts())

train.head()

550 183
549 184
550 183
550 183
3    9622
0    9379
1    8860
2    8612
Name: fold, dtype: int64


Unnamed: 0,pred0,pred1,pred2,pred3,pred4,pred5,id,score,anchor,fold
0,0.403073,0.003255,0.030808,0.00088,0.003149,0.000215,54c1e3b9184cb5b6,0.0,abatement,0
1,0.00086,0.259491,0.276099,0.236394,0.24772,0.24168,ef2d4c2e6bbb208d,0.25,abatement,0
2,0.267246,0.478204,0.491518,0.428095,0.501632,0.503771,4c3f2750e7540ab7,0.5,abatement,0
3,0.476264,0.518456,0.506613,0.482324,0.536995,0.479786,bfd7270f57530991,0.5,abatement,0
4,0.458974,0.467527,0.448548,0.273144,0.073328,0.005082,cc96541d4987b399,0.0,abatement,0


In [14]:
class lgbm_cfg:
    n_splits = CFG.n_fold
    SEED = [1999, 42, 2022]
    params = {
        "lambda_l1":100, #大きいほど正則化が大きい
        'boosting_type': 'gbdt',      # GBDTを指定
        'objective': 'regression',    # 回帰を指定
        'metric': 'rmse',             # 回帰の評価関数
        'learning_rate': 0.01,
    }

In [15]:
def train_lgbm(train, seed, fold_num, features, params):
    oof = np.zeros((len(train), 1))
    tr_idx = train[train.fold!=fold_num].index
    va_idx = train[train.fold==fold_num].index
    
    train_x = train.iloc[tr_idx][features]
    train_y = train.iloc[tr_idx]["score"]
    
    valid_x = train.iloc[va_idx][features]
    valid_y = train.iloc[va_idx]["score"]
    
    train_set = lgb.Dataset(train_x, train_y)
    val_set = lgb.Dataset(valid_x, valid_y)

    model = lgb.train(params, 
                          train_set, 
                          num_boost_round = 2500,
                          early_stopping_rounds = 50,
                          valid_sets = [train_set, val_set], 
                          verbose_eval = -1)
    pd.to_pickle(model, f"{OUTPUT_DIR}fold{fold_num}_seed{seed}_lgbm.pkl")
    pred = model.predict(valid_x, num_iteration=model.best_iteration).reshape(-1,1)
    score = get_score(valid_y, pred)
    LOGGER.info(f"seed{seed}, fold{fold_num}, score {score}")
    oof[va_idx] = pred
    return oof

In [16]:
features = train.columns[:len(exp_names)]
features

Index(['pred0', 'pred1', 'pred2', 'pred3', 'pred4', 'pred5'], dtype='object')

In [17]:
oof_df = np.zeros((len(train), 1))
for seed in lgbm_cfg.SEED:
    lgbm_cfg.params["seed"] = seed
    for fold_num in range(lgbm_cfg.n_splits,):
        _oof = train_lgbm(train,seed, fold_num, features, lgbm_cfg.params)
        oof_df += _oof/len(lgbm_cfg.SEED)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27094, number of used features: 6
[LightGBM] [Info] Start training from score 0.360809
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1746]	training's rmse: 0.129484	valid_1's rmse: 0.126564


seed1999, fold0, score [0.8724947027148824]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27613, number of used features: 6
[LightGBM] [Info] Start training from score 0.361243
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1990]	training's rmse: 0.127344	valid_1's rmse: 0.131207


seed1999, fold1, score [0.862598532767224]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27861, number of used features: 6
[LightGBM] [Info] Start training from score 0.362397
Training until validation scores don't improve for 50 rounds


seed1999, fold2, score [0.878952672701519]


Early stopping, best iteration is:
[1153]	training's rmse: 0.129287	valid_1's rmse: 0.125553
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 26851, number of used features: 6
[LightGBM] [Info] Start training from score 0.363823
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2500]	training's rmse: 0.126137	valid_1's rmse: 0.134556


seed1999, fold3, score [0.8523871819952015]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27094, number of used features: 6
[LightGBM] [Info] Start training from score 0.360809
Training until validation scores don't improve for 50 rounds


seed42, fold0, score [0.8724947027148824]


Early stopping, best iteration is:
[1746]	training's rmse: 0.129484	valid_1's rmse: 0.126564
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27613, number of used features: 6
[LightGBM] [Info] Start training from score 0.361243
Training until validation scores don't improve for 50 rounds


seed42, fold1, score [0.862598532767224]


Early stopping, best iteration is:
[1990]	training's rmse: 0.127344	valid_1's rmse: 0.131207
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27861, number of used features: 6
[LightGBM] [Info] Start training from score 0.362397
Training until validation scores don't improve for 50 rounds


seed42, fold2, score [0.878952672701519]


Early stopping, best iteration is:
[1153]	training's rmse: 0.129287	valid_1's rmse: 0.125553
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 26851, number of used features: 6
[LightGBM] [Info] Start training from score 0.363823
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2500]	training's rmse: 0.126137	valid_1's rmse: 0.134556


seed42, fold3, score [0.8523871819952015]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27094, number of used features: 6
[LightGBM] [Info] Start training from score 0.360809
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1746]	training's rmse: 0.129484	valid_1's rmse: 0.126564


seed2022, fold0, score [0.8724947027148824]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27613, number of used features: 6
[LightGBM] [Info] Start training from score 0.361243
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1990]	training's rmse: 0.127344	valid_1's rmse: 0.131207


seed2022, fold1, score [0.862598532767224]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 27861, number of used features: 6
[LightGBM] [Info] Start training from score 0.362397
Training until validation scores don't improve for 50 rounds


seed2022, fold2, score [0.878952672701519]


Early stopping, best iteration is:
[1153]	training's rmse: 0.129287	valid_1's rmse: 0.125553
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 26851, number of used features: 6
[LightGBM] [Info] Start training from score 0.363823
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[2500]	training's rmse: 0.126137	valid_1's rmse: 0.134556


seed2022, fold3, score [0.8523871819952015]


In [18]:
score = get_score(targets, oof_df)
LOGGER.info(f"lgbm score {score}")

lgbm score [0.86645155278507]


In [19]:
train["score"] = oof_df

train.to_csv(f"{OUTPUT_DIR}oof_lgbm_df.csv",index=False)

# xgboost

In [20]:
import xgboost as xgb

In [21]:
class xgb_cfg:
    params = {
        'objective': 'reg:squarederror','silent':1, 'random_state':None, 
        # 学習用の指標 (RMSE)
        'eval_metric': 'rmse',
    }
    SEED = [1999, 42, 2022]
    n_splits = CFG.n_fold

In [22]:
def train_xgb(train, seed, fold_num, features, params):
    oof = np.zeros((len(train), 1))
    tr_idx = train[train.fold!=fold_num].index
    va_idx = train[train.fold==fold_num].index
    
    train_x = train.iloc[tr_idx][features]
    train_y = train.iloc[tr_idx]["score"]
    
    valid_x = train.iloc[va_idx][features]
    valid_y = train.iloc[va_idx]["score"]
    
    train_set = xgb.DMatrix(train_x, train_y)
    val_set = xgb.DMatrix(valid_x, valid_y)
    
    num_round = 2500
    watchlist = [(train_set, 'train'), (val_set, 'eval')]
    model = xgb.train(params,
                    train_set,#訓練データ
                    num_round,#設定した学習回数
                    early_stopping_rounds=50,
                    evals=watchlist,
                    )
    
    pd.to_pickle(model, f"{OUTPUT_DIR}fold{fold_num}_seed{seed}_xgb.pkl")
    
    val = xgb.DMatrix(valid_x)
    pred = model.predict(val, ntree_limit=model.best_ntree_limit).reshape(-1,1)
    
    score = get_score(valid_y, pred)
    LOGGER.info(f"seed{seed}, fold{fold_num}, score {score}")
    oof[va_idx] = pred
    return oof

In [23]:
oof_df = np.zeros((len(train), 1))
for seed in xgb_cfg.SEED:
    xgb_cfg.params["seed"] = seed
    for fold_num in range(xgb_cfg.n_splits,):
        _oof = train_xgb(train,seed, fold_num, features, xgb_cfg.params)
        oof_df += _oof/len(xgb_cfg.SEED)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17744	eval-rmse:0.17661
[1]	train-rmse:0.12484	eval-rmse:0.12442
[2]	train-rmse:0.08799	eval-rmse:0.08791
[3]	train-rmse:0.06220	eval-rmse:0.06235
[4]	train-rmse:0.04413	eval-rmse:0.04457
[5]	train-rmse:0.03156	eval-rmse:0.03234
[6]	train-rmse:0.02285	eval-rmse:0.02402
[7]	train-rmse:0.01692	eval-rmse:0.01851
[8]	train-rmse:0.01298	eval-rmse:0.01500
[9]	train-rmse:0.01044	eval-rmse:0.01288
[10]	train-rmse:0.00886	eval-rmse:0.01163
[11]	train-rmse:0.00792	eval-rmse:0.01091
[12]	train-rmse:0.00737	eval-rmse:0.01053
[13]	train-rmse:0.00702	eval-rmse:0.01028
[14]	train-rmse:0.00677	eval-rmse:0.01009
[15]	train-rmse:0.00661	eval-rmse:0.01002
[16]	train-rmse:0.00651	e

seed1999, fold0, score [0.9990658409395438]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17754	eval-rmse:0.17623
[1]	train-rmse:0.12494	eval-rmse:0.12416
[2]	train-rmse:0.08806	eval-rmse:0.08766
[3]	train-rmse:0.06229	eval-rmse:0.06228
[4]	train-rmse:0.04423	eval-rmse:0.04450
[5]	train-rmse:0.03167	eval-rmse:0.03220
[6]	train-rmse:0.02297	eval-rmse:0.02379
[7]	train-rmse:0.01706	eval-rmse:0.01817
[8]	train-rmse:0.01316	eval-rmse:0.01457
[9]	train-rmse:0.01063	eval-rmse:0.01232
[10]	train-rmse:0.00910	eval-rmse:0.01103
[11]	train-rmse:0.00816	eval-rmse:0.01024
[12]	train-rmse:0.00761	eval-rmse:0.00980
[13]	train-rmse:0.00730	eval-rmse:0.00955
[14]	train-rmse:0.00709	eval-rmse:0.00939
[15]	train-rmse:0.00693	eval-rmse:0.00926
[16]	train-rmse:0.00678	e

seed1999, fold1, score [0.9992181620756431]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17668	eval-rmse:0.17855
[1]	train-rmse:0.12432	eval-rmse:0.12550
[2]	train-rmse:0.08763	eval-rmse:0.08829
[3]	train-rmse:0.06194	eval-rmse:0.06237
[4]	train-rmse:0.04397	eval-rmse:0.04428
[5]	train-rmse:0.03146	eval-rmse:0.03175
[6]	train-rmse:0.02281	eval-rmse:0.02323
[7]	train-rmse:0.01693	eval-rmse:0.01757
[8]	train-rmse:0.01300	eval-rmse:0.01400
[9]	train-rmse:0.01049	eval-rmse:0.01182
[10]	train-rmse:0.00893	eval-rmse:0.01058
[11]	train-rmse:0.00801	eval-rmse:0.00994
[12]	train-rmse:0.00747	eval-rmse:0.00958
[13]	train-rmse:0.00714	eval-rmse:0.00941
[14]	train-rmse:0.00695	eval-rmse:0.00933
[15]	train-rmse:0.00682	eval-rmse:0.00929
[16]	train-rmse:0.00668	e

seed1999, fold2, score [0.999200988398228]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17710	eval-rmse:0.17758
[1]	train-rmse:0.12464	eval-rmse:0.12516
[2]	train-rmse:0.08785	eval-rmse:0.08844
[3]	train-rmse:0.06211	eval-rmse:0.06273
[4]	train-rmse:0.04414	eval-rmse:0.04481
[5]	train-rmse:0.03161	eval-rmse:0.03243
[6]	train-rmse:0.02294	eval-rmse:0.02390
[7]	train-rmse:0.01704	eval-rmse:0.01822
[8]	train-rmse:0.01315	eval-rmse:0.01458
[9]	train-rmse:0.01065	eval-rmse:0.01231
[10]	train-rmse:0.00909	eval-rmse:0.01095
[11]	train-rmse:0.00812	eval-rmse:0.01013
[12]	train-rmse:0.00757	eval-rmse:0.00971
[13]	train-rmse:0.00725	eval-rmse:0.00947
[14]	train-rmse:0.00703	eval-rmse:0.00930
[15]	train-rmse:0.00689	eval-rmse:0.00921
[16]	train-rmse:0.00677	e

seed1999, fold3, score [0.9991618528880426]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17744	eval-rmse:0.17661
[1]	train-rmse:0.12484	eval-rmse:0.12442
[2]	train-rmse:0.08799	eval-rmse:0.08791
[3]	train-rmse:0.06220	eval-rmse:0.06235
[4]	train-rmse:0.04413	eval-rmse:0.04457
[5]	train-rmse:0.03156	eval-rmse:0.03234
[6]	train-rmse:0.02285	eval-rmse:0.02402
[7]	train-rmse:0.01692	eval-rmse:0.01851
[8]	train-rmse:0.01298	eval-rmse:0.01500
[9]	train-rmse:0.01044	eval-rmse:0.01288
[10]	train-rmse:0.00886	eval-rmse:0.01163
[11]	train-rmse:0.00792	eval-rmse:0.01091
[12]	train-rmse:0.00737	eval-rmse:0.01053
[13]	train-rmse:0.00702	eval-rmse:0.01028
[14]	train-rmse:0.00677	eval-rmse:0.01009
[15]	train-rmse:0.00661	eval-rmse:0.01002
[16]	train-rmse:0.00651	e

seed42, fold0, score [0.9990658409395438]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17754	eval-rmse:0.17623
[1]	train-rmse:0.12494	eval-rmse:0.12416
[2]	train-rmse:0.08806	eval-rmse:0.08766
[3]	train-rmse:0.06229	eval-rmse:0.06228
[4]	train-rmse:0.04423	eval-rmse:0.04450
[5]	train-rmse:0.03167	eval-rmse:0.03220
[6]	train-rmse:0.02297	eval-rmse:0.02379
[7]	train-rmse:0.01706	eval-rmse:0.01817
[8]	train-rmse:0.01316	eval-rmse:0.01457
[9]	train-rmse:0.01063	eval-rmse:0.01232
[10]	train-rmse:0.00910	eval-rmse:0.01103
[11]	train-rmse:0.00816	eval-rmse:0.01024
[12]	train-rmse:0.00761	eval-rmse:0.00980
[13]	train-rmse:0.00730	eval-rmse:0.00955
[14]	train-rmse:0.00709	eval-rmse:0.00939
[15]	train-rmse:0.00693	eval-rmse:0.00926
[16]	train-rmse:0.00678	e

seed42, fold1, score [0.9992181620756431]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17668	eval-rmse:0.17855
[1]	train-rmse:0.12432	eval-rmse:0.12550
[2]	train-rmse:0.08763	eval-rmse:0.08829
[3]	train-rmse:0.06194	eval-rmse:0.06237
[4]	train-rmse:0.04397	eval-rmse:0.04428
[5]	train-rmse:0.03146	eval-rmse:0.03175
[6]	train-rmse:0.02281	eval-rmse:0.02323
[7]	train-rmse:0.01693	eval-rmse:0.01757
[8]	train-rmse:0.01300	eval-rmse:0.01400
[9]	train-rmse:0.01049	eval-rmse:0.01182
[10]	train-rmse:0.00893	eval-rmse:0.01058
[11]	train-rmse:0.00801	eval-rmse:0.00994
[12]	train-rmse:0.00747	eval-rmse:0.00958
[13]	train-rmse:0.00714	eval-rmse:0.00941
[14]	train-rmse:0.00695	eval-rmse:0.00933
[15]	train-rmse:0.00682	eval-rmse:0.00929
[16]	train-rmse:0.00668	e

seed42, fold2, score [0.999200988398228]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17710	eval-rmse:0.17758
[1]	train-rmse:0.12464	eval-rmse:0.12516
[2]	train-rmse:0.08785	eval-rmse:0.08844
[3]	train-rmse:0.06211	eval-rmse:0.06273
[4]	train-rmse:0.04414	eval-rmse:0.04481
[5]	train-rmse:0.03161	eval-rmse:0.03243
[6]	train-rmse:0.02294	eval-rmse:0.02390
[7]	train-rmse:0.01704	eval-rmse:0.01822
[8]	train-rmse:0.01315	eval-rmse:0.01458
[9]	train-rmse:0.01065	eval-rmse:0.01231
[10]	train-rmse:0.00909	eval-rmse:0.01095
[11]	train-rmse:0.00812	eval-rmse:0.01013
[12]	train-rmse:0.00757	eval-rmse:0.00971
[13]	train-rmse:0.00725	eval-rmse:0.00947
[14]	train-rmse:0.00703	eval-rmse:0.00930
[15]	train-rmse:0.00689	eval-rmse:0.00921
[16]	train-rmse:0.00677	e

seed42, fold3, score [0.9991618528880426]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17744	eval-rmse:0.17661
[1]	train-rmse:0.12484	eval-rmse:0.12442
[2]	train-rmse:0.08799	eval-rmse:0.08791
[3]	train-rmse:0.06220	eval-rmse:0.06235
[4]	train-rmse:0.04413	eval-rmse:0.04457
[5]	train-rmse:0.03156	eval-rmse:0.03234
[6]	train-rmse:0.02285	eval-rmse:0.02402
[7]	train-rmse:0.01692	eval-rmse:0.01851
[8]	train-rmse:0.01298	eval-rmse:0.01500
[9]	train-rmse:0.01044	eval-rmse:0.01288
[10]	train-rmse:0.00886	eval-rmse:0.01163
[11]	train-rmse:0.00792	eval-rmse:0.01091
[12]	train-rmse:0.00737	eval-rmse:0.01053
[13]	train-rmse:0.00702	eval-rmse:0.01028
[14]	train-rmse:0.00677	eval-rmse:0.01009
[15]	train-rmse:0.00661	eval-rmse:0.01002
[16]	train-rmse:0.00651	e

seed2022, fold0, score [0.9990658409395438]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17754	eval-rmse:0.17623
[1]	train-rmse:0.12494	eval-rmse:0.12416
[2]	train-rmse:0.08806	eval-rmse:0.08766
[3]	train-rmse:0.06229	eval-rmse:0.06228
[4]	train-rmse:0.04423	eval-rmse:0.04450
[5]	train-rmse:0.03167	eval-rmse:0.03220
[6]	train-rmse:0.02297	eval-rmse:0.02379
[7]	train-rmse:0.01706	eval-rmse:0.01817
[8]	train-rmse:0.01316	eval-rmse:0.01457
[9]	train-rmse:0.01063	eval-rmse:0.01232
[10]	train-rmse:0.00910	eval-rmse:0.01103
[11]	train-rmse:0.00816	eval-rmse:0.01024
[12]	train-rmse:0.00761	eval-rmse:0.00980
[13]	train-rmse:0.00730	eval-rmse:0.00955
[14]	train-rmse:0.00709	eval-rmse:0.00939
[15]	train-rmse:0.00693	eval-rmse:0.00926
[16]	train-rmse:0.00678	e

seed2022, fold1, score [0.9992181620756431]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17668	eval-rmse:0.17855
[1]	train-rmse:0.12432	eval-rmse:0.12550
[2]	train-rmse:0.08763	eval-rmse:0.08829
[3]	train-rmse:0.06194	eval-rmse:0.06237
[4]	train-rmse:0.04397	eval-rmse:0.04428
[5]	train-rmse:0.03146	eval-rmse:0.03175
[6]	train-rmse:0.02281	eval-rmse:0.02323
[7]	train-rmse:0.01693	eval-rmse:0.01757
[8]	train-rmse:0.01300	eval-rmse:0.01400
[9]	train-rmse:0.01049	eval-rmse:0.01182
[10]	train-rmse:0.00893	eval-rmse:0.01058
[11]	train-rmse:0.00801	eval-rmse:0.00994
[12]	train-rmse:0.00747	eval-rmse:0.00958
[13]	train-rmse:0.00714	eval-rmse:0.00941
[14]	train-rmse:0.00695	eval-rmse:0.00933
[15]	train-rmse:0.00682	eval-rmse:0.00929
[16]	train-rmse:0.00668	e

seed2022, fold2, score [0.999200988398228]


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:0.17710	eval-rmse:0.17758
[1]	train-rmse:0.12464	eval-rmse:0.12516
[2]	train-rmse:0.08785	eval-rmse:0.08844
[3]	train-rmse:0.06211	eval-rmse:0.06273
[4]	train-rmse:0.04414	eval-rmse:0.04481
[5]	train-rmse:0.03161	eval-rmse:0.03243
[6]	train-rmse:0.02294	eval-rmse:0.02390
[7]	train-rmse:0.01704	eval-rmse:0.01822
[8]	train-rmse:0.01315	eval-rmse:0.01458
[9]	train-rmse:0.01065	eval-rmse:0.01231
[10]	train-rmse:0.00909	eval-rmse:0.01095
[11]	train-rmse:0.00812	eval-rmse:0.01013
[12]	train-rmse:0.00757	eval-rmse:0.00971
[13]	train-rmse:0.00725	eval-rmse:0.00947
[14]	train-rmse:0.00703	eval-rmse:0.00930
[15]	train-rmse:0.00689	eval-rmse:0.00921
[16]	train-rmse:0.00677	e

seed2022, fold3, score [0.9991618528880426]


In [24]:
score = get_score(targets, oof_df)
LOGGER.info(f"xgb score {score}")

xgb score [0.8691720216622354]


In [25]:
train["score"] = oof_df

train.to_csv(f"{OUTPUT_DIR}oof_xgb_df.csv",index=False)

In [27]:
gbdt = (pd.read_csv(f"{OUTPUT_DIR}oof_lgbm_df.csv") + pd.read_csv(f"{OUTPUT_DIR}oof_xgb_df.csv"))["score"]/2

In [28]:
score = get_score(targets, gbdt)
LOGGER.info(f"gbdt averaging {score}")


gbdt averaging 0.8679955827518955
